Reduce discovery fan-out and pagination

Make discovery pool population respect provider priority while keeping Spotify strict, and reduce unnecessary request volume in the hot discovery paths.

- keep discovery fan-out source-priority aware
- preserve cache use where freshness is not required
- cap Spotify artist-album pagination in discovery and cache refresh paths
- keep incremental release checks to a single page, since they only need the newest releases
- add regression coverage for provider order, strict Spotify handling, and pagination caps
pull/304/head
Antti Kettunen 1 month ago
parent 0b60986f44
commit e447cf6ab0

@ -128,6 +128,8 @@ def get_album_tracks_for_source(source: str, album_id: str):
fetch = getattr(client, 'get_album_tracks_dict', None) if source == 'hydrabase' else getattr(client, 'get_album_tracks', None)
if not fetch:
return None
if source == 'spotify':
return fetch(album_id, allow_fallback=False)
return fetch(album_id)
except Exception:
return None

@ -451,6 +451,14 @@ class WatchlistScanner:
"""Return watchlist scan sources in the configured priority order."""
return list(get_source_priority(get_primary_source()))
def _discovery_source_priority(self) -> List[str]:
    """List the sources usable for discovery, highest priority first.

    The discovery pool can only store Spotify, iTunes, and Deezer IDs, so
    every other entry of the watchlist priority list is dropped. The
    relative order of the remaining sources is left untouched.
    """
    supported = {'spotify', 'itunes', 'deezer'}
    ordered: List[str] = []
    for candidate in self._watchlist_source_priority():
        if candidate in supported:
            ordered.append(candidate)
    return ordered
@staticmethod
def _artist_id_attribute_for_source(source: str) -> Optional[str]:
"""Return the watchlist artist attribute that stores the given source ID."""
@ -501,7 +509,10 @@ class WatchlistScanner:
return None
try:
search_results = client.search_artists(watchlist_artist.artist_name, limit=1)
search_kwargs = {'limit': 1}
if source == 'spotify':
search_kwargs['allow_fallback'] = False
search_results = client.search_artists(watchlist_artist.artist_name, **search_kwargs)
except Exception as e:
logger.debug("Could not search %s for %s: %s", source, watchlist_artist.artist_name, e)
return None
@ -552,7 +563,10 @@ class WatchlistScanner:
return None
try:
artist_data = client.get_artist(artist_id)
if source == 'spotify':
artist_data = client.get_artist(artist_id, allow_fallback=False)
else:
artist_data = client.get_artist(artist_id)
except Exception as e:
logger.debug("Could not fetch artist image for %s on %s: %s", watchlist_artist.artist_name, source, e)
return None
@ -566,7 +580,10 @@ class WatchlistScanner:
return None
try:
album_data = client.get_album(album_id)
if source == 'spotify':
album_data = client.get_album(album_id, allow_fallback=False)
else:
album_data = client.get_album(album_id)
except Exception as e:
logger.debug("Could not fetch album %s on %s: %s", album_id, source, e)
album_data = None
@ -666,6 +683,97 @@ class WatchlistScanner:
return image_url
return None
def _get_artist_albums_for_source(
    self,
    source: str,
    artist_id: str,
    album_type: str = 'album,single,ep',
    limit: int = 50,
    skip_cache: bool = True,
    max_pages: int = 0,
) -> List[Any]:
    """Return the albums a single source reports for an artist.

    Only the Spotify client receives ``skip_cache``, ``max_pages`` and
    ``allow_fallback=False``; every other source is called with just
    ``album_type`` and ``limit``.

    An empty list is returned when no client exists for ``source``, when
    ``artist_id`` is empty, when the client has no ``get_artist_albums``
    method, when the client returns a falsy result, or when the client
    raises (the error is logged at debug level).
    """
    source_client = get_client_for_source(source)
    if not source_client:
        return []
    if not artist_id:
        return []
    if not hasattr(source_client, 'get_artist_albums'):
        return []

    try:
        request_args = {'album_type': album_type, 'limit': limit}
        if source == 'spotify':
            # Spotify-only options: pass the caller's cache and pagination
            # settings through and forbid provider fallback.
            request_args.update(
                skip_cache=skip_cache,
                max_pages=max_pages,
                allow_fallback=False,
            )
        found = source_client.get_artist_albums(artist_id, **request_args)
        return found if found else []
    except Exception as e:
        logger.debug("Could not fetch artist albums for %s on %s: %s", artist_id, source, e)
        return []
def _get_artist_data_for_source(self, source: str, artist_id: str) -> Optional[Dict[str, Any]]:
    """Look up artist metadata on one source without provider fallback on Spotify.

    Returns whatever the source client's ``get_artist`` returns. ``None``
    is returned when no client exists for ``source``, when ``artist_id``
    is empty, when the client lacks ``get_artist``, or when the lookup
    raises (the error is logged at debug level).
    """
    source_client = get_client_for_source(source)
    if not source_client:
        return None
    if not artist_id:
        return None
    if not hasattr(source_client, 'get_artist'):
        return None

    try:
        # Only the Spotify client is asked to stay strict.
        strict_args = {'allow_fallback': False} if source == 'spotify' else {}
        return source_client.get_artist(artist_id, **strict_args)
    except Exception as e:
        logger.debug("Could not fetch artist data for %s on %s: %s", artist_id, source, e)
        return None
def _search_albums_for_source(self, source: str, query: str, limit: int = 1):
    """Run an album search on one source without provider fallback on Spotify.

    Returns the client's search results, or an empty list when no client
    exists for ``source``, when the client lacks ``search_albums``, when
    the search yields a falsy result, or when the search raises (the
    error is logged at debug level).
    """
    source_client = get_client_for_source(source)
    if not source_client:
        return []
    if not hasattr(source_client, 'search_albums'):
        return []

    try:
        # Only the Spotify client is asked to stay strict.
        strict_args = {'allow_fallback': False} if source == 'spotify' else {}
        matches = source_client.search_albums(query, limit=limit, **strict_args)
        return matches if matches else []
    except Exception as e:
        logger.debug("Could not search albums for %s on %s: %s", query, source, e)
        return []
def _resolve_artist_id_for_source(
    self,
    source: str,
    artist_name: str,
    stored_id: Optional[str] = None,
    cache_callback: Optional[Callable[[str], None]] = None,
) -> Optional[str]:
    """Return the artist ID to use on ``source``, searching by name if none is stored.

    A truthy ``stored_id`` is returned as-is without touching any client.
    Otherwise the source client is asked for a single artist match
    (Spotify with ``allow_fallback=False``) and the ID is extracted from
    the first result. When an ID is found and ``cache_callback`` is given,
    the callback is invoked with that ID; a failing callback is logged at
    debug level and does not affect the return value.

    Returns ``None`` when no client exists for ``source``, when the client
    lacks ``search_artists``, when the search raises, or when it yields no
    results.
    """
    if stored_id:
        return stored_id

    source_client = get_client_for_source(source)
    if not source_client:
        return None
    if not hasattr(source_client, 'search_artists'):
        return None

    try:
        if source == 'spotify':
            lookup_args = {'limit': 1, 'allow_fallback': False}
        else:
            lookup_args = {'limit': 1}
        matches = source_client.search_artists(artist_name, **lookup_args)
    except Exception as e:
        logger.debug("Could not resolve %s artist ID for %s: %s", source, artist_name, e)
        return None

    if not matches:
        return None

    resolved_id = self._extract_entity_id(matches[0])
    if not (resolved_id and cache_callback):
        return resolved_id

    # Persisting the freshly resolved ID is best-effort only.
    try:
        cache_callback(resolved_id)
    except Exception as e:
        logger.debug("Could not cache %s artist ID for %s: %s", source, artist_name, e)
    return resolved_id
def backfill_watchlist_artist_images(self, profile_id: int) -> int:
"""Backfill missing watchlist artist images using cached metadata and existing album art."""
try:
@ -2298,9 +2406,6 @@ class WatchlistScanner:
from datetime import datetime, timedelta
import random
if self.spotify_client and self.spotify_client.is_rate_limited():
self._disable_spotify_for_run("global Spotify rate limit active")
# Check if we should run discovery pool population (prevents over-polling)
skip_pool_population = not self.database.should_populate_discovery_pool(hours_threshold=24, profile_id=profile_id)
@ -2320,18 +2425,12 @@ class WatchlistScanner:
logger.info("Populating discovery pool from similar artists...")
# Determine which sources are available
spotify_available = self._spotify_is_primary_source()
# Import fallback metadata client (iTunes or Deezer)
itunes_client, fallback_source = _get_fallback_metadata_client()
fallback_available = True # Fallback source is always available (no auth needed)
if not spotify_available and not fallback_available:
discovery_sources = self._discovery_source_priority()
if not discovery_sources:
logger.warning("No music sources available to populate discovery pool")
return
logger.info(f"Sources available - Spotify: {spotify_available}, {fallback_source}: {fallback_available}")
logger.info("Discovery source priority: %s", discovery_sources)
# Get top similar artists for this profile's watchlist (ordered by occurrence_count)
similar_artists = self.database.get_top_similar_artists(limit=top_artists_limit, profile_id=profile_id)
@ -2360,233 +2459,172 @@ class WatchlistScanner:
if progress_callback:
progress_callback('artist', f'{similar_artist.similar_artist_name} ({artist_idx}/{len(similar_artists)})')
# Build list of sources to process for this artist
# Fallback source (iTunes/Deezer) is ALWAYS processed (baseline), Spotify is added if authenticated
sources_to_process = []
# Resolve the first source that can actually produce albums.
selected_source = None
selected_artist_id = None
selected_albums = []
artist_genres: List[str] = []
for source in discovery_sources:
source_attr = self._artist_id_attribute_for_source(source)
stored_id = getattr(similar_artist, source_attr, None) if source_attr else None
cache_callback = None
if source == 'itunes':
cache_callback = lambda found_id, artist_id=similar_artist.id: self.database.update_similar_artist_itunes_id(artist_id, found_id)
elif source == 'deezer':
cache_callback = lambda found_id, artist_id=similar_artist.id: self.database.update_similar_artist_deezer_id(artist_id, found_id)
artist_id = self._resolve_artist_id_for_source(
source,
similar_artist.similar_artist_name,
stored_id=stored_id,
cache_callback=cache_callback,
)
if not artist_id:
continue
# Always add fallback source first (baseline source)
fallback_id = similar_artist.similar_artist_itunes_id if fallback_source == 'itunes' else getattr(similar_artist, 'similar_artist_deezer_id', None)
if not fallback_id:
# On-the-fly lookup for missing fallback ID (seamless provider switching)
try:
fallback_results = itunes_client.search_artists(similar_artist.similar_artist_name, limit=1)
if fallback_results and len(fallback_results) > 0:
fallback_id = fallback_results[0].id
# Cache it for future use
if fallback_source == 'deezer':
self.database.update_similar_artist_deezer_id(similar_artist.id, fallback_id)
else:
self.database.update_similar_artist_itunes_id(similar_artist.id, fallback_id)
logger.debug(f" Resolved {fallback_source} ID {fallback_id} for {similar_artist.similar_artist_name}")
except Exception as e:
logger.debug(f" Could not resolve {fallback_source} ID for {similar_artist.similar_artist_name}: {e}")
all_albums = self._get_artist_albums_for_source(
source,
artist_id,
album_type='album,single,ep',
limit=50,
skip_cache=False,
max_pages=2,
)
if not all_albums:
logger.debug(f"No albums found for {similar_artist.similar_artist_name} on {source}")
continue
if fallback_id:
sources_to_process.append((fallback_source, fallback_id))
artist_data = self._get_artist_data_for_source(source, artist_id)
if artist_data and 'genres' in artist_data:
artist_genres = artist_data['genres']
# Add Spotify if authenticated and we have an ID
if spotify_available and similar_artist.similar_artist_spotify_id:
sources_to_process.append(('spotify', similar_artist.similar_artist_spotify_id))
albums = [a for a in all_albums if hasattr(a, 'album_type') and a.album_type == 'album']
singles_eps = [a for a in all_albums if hasattr(a, 'album_type') and a.album_type in ['single', 'ep']]
selected_albums = []
if not sources_to_process:
logger.debug(f"No valid IDs for {similar_artist.similar_artist_name}, skipping")
continue
latest_releases = all_albums[:3]
selected_albums.extend(latest_releases)
logger.debug(f" Processing {len(sources_to_process)} source(s): {[s[0] for s in sources_to_process]}")
remaining_slots = albums_per_artist - len(selected_albums)
if remaining_slots > 0:
remaining_content = all_albums[3:]
if len(remaining_content) > remaining_slots:
selected_albums.extend(random.sample(remaining_content, remaining_slots))
else:
selected_albums.extend(remaining_content)
# Process each source for this artist
for source, artist_id in sources_to_process:
try:
# Get artist's albums from this source
if source == 'spotify':
all_albums = self.spotify_client.get_artist_albums(
artist_id,
album_type='album,single,ep',
limit=50,
skip_cache=True,
)
else: # itunes or deezer fallback
all_albums = itunes_client.get_artist_albums(
artist_id,
album_type='album,single,ep',
limit=50
)
if not all_albums:
logger.debug(f"No albums found for {similar_artist.similar_artist_name} on {source}")
continue
selected_source = source
selected_artist_id = artist_id
logger.info(
f" [{source}] Selected {len(selected_albums)} releases from {len(all_albums)} available "
f"(albums: {len(albums)}, singles/EPs: {len(singles_eps)})"
)
break
# Fetch artist genres for this source
artist_genres = []
try:
if source == 'spotify':
artist_data = self.spotify_client.get_artist(artist_id)
if artist_data and 'genres' in artist_data:
artist_genres = artist_data['genres']
else: # itunes/deezer - genres from artist lookup
artist_data = itunes_client.get_artist(artist_id)
if artist_data and 'genres' in artist_data:
artist_genres = artist_data['genres']
except Exception as e:
logger.debug(f"Could not fetch genres for {similar_artist.similar_artist_name} on {source}: {e}")
if not selected_source or not selected_artist_id or not selected_albums:
logger.debug(f"No valid source/albums for {similar_artist.similar_artist_name}, skipping")
continue
# IMPROVED: Smart selection mixing albums, singles, and EPs
# Prioritize recent releases and popular content
# Process each selected album from the winning source.
for album_idx, album in enumerate(selected_albums, 1):
try:
album_data = self._get_album_data_for_source(selected_source, album.id, album_name=album.name)
if not album_data:
continue
# Separate by type for balanced selection
albums = [a for a in all_albums if hasattr(a, 'album_type') and a.album_type == 'album']
singles_eps = [a for a in all_albums if hasattr(a, 'album_type') and a.album_type in ['single', 'ep']]
other = [a for a in all_albums if not hasattr(a, 'album_type')]
tracks = self._extract_track_items(album_data)
logger.debug(f" Album {album_idx}: {album_data.get('name', 'Unknown')} ({len(tracks)} tracks)")
# Select albums: latest releases + popular older content
selected_albums = []
if self._has_placeholder_tracks(tracks):
logger.info(f" Skipping album with placeholder tracks: {album_data.get('name', 'Unknown')}")
continue
# Always include 3 most recent releases (any type) - this captures new singles/EPs
latest_releases = all_albums[:3]
selected_albums.extend(latest_releases)
is_new = False
try:
release_date_str = album_data.get('release_date', '')
if release_date_str and len(release_date_str) >= 10:
release_date = datetime.strptime(release_date_str[:10], "%Y-%m-%d")
is_new = (datetime.now() - release_date).days <= 30
except Exception:
pass
# Add remaining slots with balanced mix
remaining_slots = albums_per_artist - len(selected_albums)
if remaining_slots > 0:
# Combine remaining albums and singles
remaining_content = all_albums[3:]
for track in tracks:
try:
enhanced_track = {
**track,
'album': {
'id': album_data['id'],
'name': album_data.get('name', 'Unknown Album'),
'images': album_data.get('images', []),
'release_date': album_data.get('release_date', ''),
'album_type': album_data.get('album_type', 'album'),
'total_tracks': album_data.get('total_tracks', 0)
},
'_source': selected_source
}
if len(remaining_content) > remaining_slots:
# Randomly select from remaining content
random_selection = random.sample(remaining_content, remaining_slots)
selected_albums.extend(random_selection)
else:
selected_albums.extend(remaining_content)
raw_popularity = album_data.get('popularity', 0)
if selected_source in ('itunes', 'deezer') and raw_popularity == 0:
synth_pop = 45
if is_new:
synth_pop += 25
else:
try:
release_str = album_data.get('release_date', '')
if release_str and len(release_str) >= 10:
rel_date = datetime.strptime(release_str[:10], "%Y-%m-%d")
age_days = (datetime.now() - rel_date).days
if age_days <= 90:
synth_pop += 15
elif age_days <= 365:
synth_pop += 5
except Exception:
pass
if similar_artist.occurrence_count >= 3:
synth_pop += 10
elif similar_artist.occurrence_count >= 2:
synth_pop += 5
raw_popularity = min(synth_pop, 100)
logger.info(f" [{source}] Selected {len(selected_albums)} releases from {len(all_albums)} available (albums: {len(albums)}, singles/EPs: {len(singles_eps)})")
track_data = {
'track_name': track.get('name', 'Unknown Track'),
'artist_name': similar_artist.similar_artist_name,
'album_name': album_data.get('name', 'Unknown Album'),
'album_cover_url': album_data.get('images', [{}])[0].get('url') if album_data.get('images') else None,
'duration_ms': track.get('duration_ms', 0),
'popularity': raw_popularity,
'release_date': album_data.get('release_date', ''),
'is_new_release': is_new,
'track_data_json': enhanced_track,
'artist_genres': artist_genres
}
# Process each selected album
for album_idx, album in enumerate(selected_albums, 1):
try:
# Get full album data with tracks from appropriate source
if source == 'spotify':
album_data = self.spotify_client.get_album(album.id)
if not album_data or 'tracks' not in album_data:
continue
tracks = album_data['tracks'].get('items', [])
else: # itunes or deezer fallback
album_data = itunes_client.get_album(album.id)
if not album_data:
continue
# get_album includes tracks by default (include_tracks=True)
tracks = album_data.get('tracks', {}).get('items', [])
logger.debug(f" Album {album_idx}: {album_data.get('name', 'Unknown')} ({len(tracks)} tracks)")
# Skip albums with placeholder tracks (unreleased tracklist)
if self._has_placeholder_tracks(tracks):
logger.info(f" Skipping album with placeholder tracks: {album_data.get('name', 'Unknown')}")
continue
if selected_source == 'spotify':
track_data['spotify_track_id'] = track.get('id')
track_data['spotify_album_id'] = album_data.get('id')
track_data['spotify_artist_id'] = selected_artist_id
elif selected_source == 'deezer':
track_data['deezer_track_id'] = track.get('id')
track_data['deezer_album_id'] = album_data.get('id')
track_data['deezer_artist_id'] = selected_artist_id
else:
track_data['itunes_track_id'] = track.get('id')
track_data['itunes_album_id'] = album_data.get('id')
track_data['itunes_artist_id'] = selected_artist_id
# Determine if this is a new release (within last 30 days)
is_new = False
try:
release_date_str = album_data.get('release_date', '')
if release_date_str:
# Handle full date or year-only
if len(release_date_str) >= 10:
release_date = datetime.strptime(release_date_str[:10], "%Y-%m-%d")
days_old = (datetime.now() - release_date).days
is_new = days_old <= 30
except:
pass
# Add each track to discovery pool
for track in tracks:
try:
# Enhance track object with full album data (including album_type)
enhanced_track = {
**track,
'album': {
'id': album_data['id'],
'name': album_data.get('name', 'Unknown Album'),
'images': album_data.get('images', []),
'release_date': album_data.get('release_date', ''),
'album_type': album_data.get('album_type', 'album'),
'total_tracks': album_data.get('total_tracks', 0)
},
'_source': source
}
# Build track data for discovery pool with source-specific IDs
# iTunes/Deezer have no popularity data — synthesize from recency + occurrence
raw_popularity = album_data.get('popularity', 0)
if source in ('itunes', 'deezer') and raw_popularity == 0:
# Base 45, boost by recency and artist occurrence count
synth_pop = 45
if is_new:
synth_pop += 25 # New releases get a big boost
else:
try:
release_str = album_data.get('release_date', '')
if release_str and len(release_str) >= 10:
rel_date = datetime.strptime(release_str[:10], "%Y-%m-%d")
age_days = (datetime.now() - rel_date).days
if age_days <= 90:
synth_pop += 15
elif age_days <= 365:
synth_pop += 5
except:
pass
# Artists that appear similar to multiple watchlist artists are likely more relevant
if similar_artist.occurrence_count >= 3:
synth_pop += 10
elif similar_artist.occurrence_count >= 2:
synth_pop += 5
raw_popularity = min(synth_pop, 100)
track_data = {
'track_name': track.get('name', 'Unknown Track'),
'artist_name': similar_artist.similar_artist_name,
'album_name': album_data.get('name', 'Unknown Album'),
'album_cover_url': album_data.get('images', [{}])[0].get('url') if album_data.get('images') else None,
'duration_ms': track.get('duration_ms', 0),
'popularity': raw_popularity,
'release_date': album_data.get('release_date', ''),
'is_new_release': is_new,
'track_data_json': enhanced_track,
'artist_genres': artist_genres
}
# Add source-specific IDs
if source == 'spotify':
track_data['spotify_track_id'] = track.get('id')
track_data['spotify_album_id'] = album_data.get('id')
track_data['spotify_artist_id'] = similar_artist.similar_artist_spotify_id
elif source == 'deezer':
track_data['deezer_track_id'] = track.get('id')
track_data['deezer_album_id'] = album_data.get('id')
track_data['deezer_artist_id'] = getattr(similar_artist, 'similar_artist_deezer_id', None)
else: # itunes
track_data['itunes_track_id'] = track.get('id')
track_data['itunes_album_id'] = album_data.get('id')
track_data['itunes_artist_id'] = similar_artist.similar_artist_itunes_id
# Add to discovery pool with source (scoped to profile)
if self.database.add_to_discovery_pool(track_data, source=source, profile_id=profile_id):
total_tracks_added += 1
except Exception as track_error:
logger.debug(f"Error adding track to discovery pool: {track_error}")
continue
# Small delay between albums
time.sleep(DELAY_BETWEEN_ALBUMS)
except Exception as album_error:
logger.warning(f"Error processing album on {source}: {album_error}")
if self.database.add_to_discovery_pool(track_data, source=selected_source, profile_id=profile_id):
total_tracks_added += 1
except Exception as track_error:
logger.debug(f"Error adding track to discovery pool: {track_error}")
continue
except Exception as source_error:
logger.warning(f"Error processing {source} source for {similar_artist.similar_artist_name}: {source_error}")
time.sleep(DELAY_BETWEEN_ALBUMS)
except Exception as album_error:
logger.warning(f"Error processing album on {selected_source}: {album_error}")
continue
# Delay between artists (after processing all sources for this artist)
if artist_idx < len(similar_artists):
time.sleep(DELAY_BETWEEN_ARTISTS)
@ -2625,67 +2663,48 @@ class WatchlistScanner:
db_source = None
artist_id_for_genres = None
# Try Spotify first if available
if spotify_available:
for source in discovery_sources:
try:
search_results = self.spotify_client.search_albums(
f"album:{album_row['title']} artist:{album_row['artist_name']}",
limit=1,
allow_fallback=False,
)
if search_results and len(search_results) > 0:
spotify_album = search_results[0]
album_data = self.spotify_client.get_album(spotify_album.id)
if album_data and 'tracks' in album_data:
tracks = album_data['tracks'].get('items', [])
db_source = 'spotify'
if album_data.get('artists'):
artist_id_for_genres = album_data['artists'][0]['id']
except Exception as e:
logger.debug(f"Spotify search failed for {album_row['title']}: {e}")
search_query = query if source != 'spotify' else f"album:{album_row['title']} artist:{album_row['artist_name']}"
search_results = self._search_albums_for_source(source, search_query, limit=1)
if not search_results:
continue
# Fall back to fallback source (iTunes/Deezer) if Spotify didn't work
if not tracks and fallback_available:
try:
search_results = itunes_client.search_albums(query, limit=1)
if search_results and len(search_results) > 0:
fallback_album = search_results[0]
album_data = itunes_client.get_album(fallback_album.id)
if album_data:
tracks_data = itunes_client.get_album_tracks(fallback_album.id)
tracks = tracks_data.get('items', []) if tracks_data else []
db_source = fallback_source
# Artist ID is in the album data
if album_data.get('artists'):
artist_id_for_genres = album_data['artists'][0].get('id')
album_candidate = search_results[0]
album_data = self._get_album_data_for_source(source, album_candidate.id, album_name=album_row['title'])
if not album_data:
continue
tracks = self._extract_track_items(album_data)
if not tracks:
continue
db_source = source
if album_data.get('artists'):
artist_id_for_genres = album_data['artists'][0].get('id')
break
except Exception as e:
logger.debug(f"{fallback_source} search failed for {album_row['title']}: {e}")
logger.debug(f"{source} search failed for {album_row['title']}: {e}")
if not tracks or not album_data:
continue
# Fetch artist genres
artist_genres = []
try:
if artist_id_for_genres:
if db_source == 'spotify':
artist_data = self.spotify_client.get_artist(artist_id_for_genres)
else:
artist_data = itunes_client.get_artist(artist_id_for_genres)
artist_data = self._get_artist_data_for_source(db_source, artist_id_for_genres)
if artist_data and 'genres' in artist_data:
artist_genres = artist_data['genres']
except Exception as e:
logger.debug(f"Could not fetch genres for album artist: {e}")
# Check if new release
is_new = False
try:
release_date_str = album_data.get('release_date', '')
if release_date_str and len(release_date_str) >= 10:
release_date = datetime.strptime(release_date_str[:10], "%Y-%m-%d")
days_old = (datetime.now() - release_date).days
is_new = days_old <= 30
except:
is_new = (datetime.now() - release_date).days <= 30
except Exception:
pass
for track in tracks:
@ -2716,7 +2735,6 @@ class WatchlistScanner:
'artist_genres': artist_genres
}
# Add source-specific IDs
if db_source == 'spotify':
track_data['spotify_track_id'] = track.get('id')
track_data['spotify_album_id'] = album_data.get('id')
@ -2725,14 +2743,14 @@ class WatchlistScanner:
track_data['deezer_track_id'] = track.get('id')
track_data['deezer_album_id'] = album_data.get('id')
track_data['deezer_artist_id'] = artist_id_for_genres or ''
else: # itunes
else:
track_data['itunes_track_id'] = track.get('id')
track_data['itunes_album_id'] = album_data.get('id')
track_data['itunes_artist_id'] = artist_id_for_genres or ''
if self.database.add_to_discovery_pool(track_data, source=db_source, profile_id=profile_id):
total_tracks_added += 1
except Exception as track_error:
except Exception:
continue
time.sleep(DELAY_BETWEEN_ALBUMS)
@ -2819,6 +2837,7 @@ class WatchlistScanner:
album_type='album,single,ep',
limit=5,
skip_cache=True,
max_pages=1,
)
if not recent_releases:
@ -3060,6 +3079,7 @@ class WatchlistScanner:
album_type='album,single,ep',
limit=20,
skip_cache=True,
max_pages=2,
)
for album in albums or []:
process_album(album, artist.artist_name, artist.spotify_artist_id, fallback_id if fallback_source == 'itunes' else None, 'spotify')
@ -3117,6 +3137,7 @@ class WatchlistScanner:
album_type='album,single,ep',
limit=20,
skip_cache=True,
max_pages=2,
)
for album in albums or []:
process_album(album, artist.similar_artist_name, artist.similar_artist_spotify_id, fallback_id if fallback_source == 'itunes' else None, 'spotify')

@ -76,11 +76,14 @@ class _FakeMetadataService:
class _FakeSourceClient:
def __init__(self, *, artist_id: str, albums, image_url: str):
def __init__(self, *, artist_id: str, albums, image_url: str, album_payload=None, album_search_results=None):
self.artist_id = artist_id
self.albums = list(albums)
self.image_url = image_url
self.album_payload = album_payload
self.album_search_results = list(album_search_results or [])
self.search_calls = []
self.search_album_calls = []
self.album_calls = []
self.artist_calls = []
@ -88,22 +91,41 @@ class _FakeSourceClient:
self.search_calls.append((query, limit, kwargs))
return [types.SimpleNamespace(id=self.artist_id, name=query)]
def search_albums(self, query, limit=1, **kwargs):
self.search_album_calls.append((query, limit, kwargs))
return list(self.album_search_results)
def get_artist_albums(self, artist_id, album_type='album,single', limit=50, **kwargs):
self.album_calls.append((artist_id, album_type, limit, kwargs))
return list(self.albums)
def get_artist(self, artist_id):
def get_artist(self, artist_id, **kwargs):
self.artist_calls.append(artist_id)
return {
"id": artist_id,
"images": [{"url": self.image_url}] if self.image_url else [],
}
def get_album(self, album_id, **kwargs):
self.album_calls.append((album_id, kwargs))
if self.album_payload is not None:
return self.album_payload
return {
"id": album_id,
"name": "Album One",
"images": [{"url": self.image_url}] if self.image_url else [],
"tracks": {"items": []},
"artists": [{"id": self.artist_id}],
}
class _FakeDB:
def __init__(self, artists):
self.artists = artists
self.similar_calls = []
self.discovery_pool_calls = []
self.discovery_pool_timestamp_calls = []
self.db_albums = []
def get_watchlist_artists(self, profile_id=None):
return list(self.artists)
@ -112,6 +134,52 @@ class _FakeDB:
self.similar_calls.append((args, kwargs))
return False
def should_populate_discovery_pool(self, hours_threshold=24, profile_id=1):
return True
def get_top_similar_artists(self, limit=50, profile_id=1):
return []
def add_to_discovery_pool(self, track_data, source, profile_id=1):
self.discovery_pool_calls.append((track_data, source, profile_id))
return True
def cleanup_old_discovery_tracks(self, days_threshold=365):
return 0
def update_discovery_pool_timestamp(self, track_count, profile_id=1):
self.discovery_pool_timestamp_calls.append((track_count, profile_id))
return True
class _Cursor:
def __init__(self, parent):
self.parent = parent
def execute(self, *args, **kwargs):
return None
def fetchall(self):
return list(self.parent.db_albums)
def fetchone(self):
return {"count": 0}
class _Conn:
def __init__(self, cursor):
self._cursor = cursor
def __enter__(self):
return self
def __exit__(self, exc_type, exc, tb):
return False
def cursor(self):
return self._cursor
def _get_connection(self):
return self._Conn(self._Cursor(self))
def _build_artist(name="Artist One", profile_id=11):
return types.SimpleNamespace(
@ -384,6 +452,269 @@ def test_get_artist_discography_for_watchlist_falls_back_when_primary_empty(monk
assert spotify_client.album_calls
def test_populate_discovery_pool_uses_primary_source_first(monkeypatch):
# Regression test: with Deezer as the primary source and a Deezer client
# that returns albums, discovery pool writes are tagged "deezer" and the
# Spotify client is never searched or asked for artist data.
#
# Zero the delay constants and swap the module's `time` for a namespace
# whose sleep is a no-op.
monkeypatch.setattr(watchlist_scanner_module, "DELAY_BETWEEN_ARTISTS", 0)
monkeypatch.setattr(watchlist_scanner_module, "DELAY_BETWEEN_ALBUMS", 0)
monkeypatch.setattr(watchlist_scanner_module, "time", types.SimpleNamespace(sleep=lambda *_args, **_kwargs: None))
# Source priority under test: deezer, then spotify, then itunes.
monkeypatch.setattr(watchlist_scanner_module, "get_primary_source", lambda: "deezer")
monkeypatch.setattr(watchlist_scanner_module, "get_source_priority", lambda primary: [primary, "spotify", "itunes"])
# The similar artist already carries a stored ID for every source.
similar_artist = types.SimpleNamespace(
id=501,
similar_artist_name="Similar Artist",
occurrence_count=3,
similar_artist_spotify_id="sp-artist",
similar_artist_itunes_id="it-artist",
similar_artist_deezer_id="dz-artist",
)
# Deezer side: one album whose payload contains a single track.
album = types.SimpleNamespace(id="dz-album-1", name="Deezer Album", album_type="album")
deezer_album_payload = {
"id": "dz-album-1",
"name": "Deezer Album",
"images": [{"url": "https://example.com/deezer-album.jpg"}],
"release_date": "2026-04-01",
"popularity": 0,
"tracks": {
"items": [
{
"id": "dz-track-1",
"name": "Deezer Track",
"duration_ms": 123456,
"artists": [{"name": "Similar Artist"}],
}
]
},
"artists": [{"id": "dz-artist"}],
}
deezer_client = _FakeSourceClient(
artist_id="dz-artist",
albums=[album],
image_url="https://example.com/deezer-artist.jpg",
album_payload=deezer_album_payload,
)
# Spotify side: also has an album, but its payload has no tracks.
spotify_client = _FakeSourceClient(
artist_id="sp-artist",
albums=[types.SimpleNamespace(id="sp-album-1", name="Spotify Album", album_type="album")],
image_url="https://example.com/spotify-artist.jpg",
album_payload={
"id": "sp-album-1",
"name": "Spotify Album",
"images": [{"url": "https://example.com/spotify-album.jpg"}],
"release_date": "2026-04-01",
"popularity": 50,
"tracks": {"items": []},
"artists": [{"id": "sp-artist"}],
},
)
# Only deezer and spotify resolve to a client; any other source (itunes
# included) gets None from the dict lookup.
def fake_get_client_for_source(source):
return {
"deezer": deezer_client,
"spotify": spotify_client,
}.get(source)
monkeypatch.setattr(watchlist_scanner_module, "get_client_for_source", fake_get_client_for_source)
# NOTE(review): _build_scanner is defined outside this view; presumably it
# wires the scanner to a _FakeDB -- confirm against the helper.
scanner = _build_scanner({"tracks": {"items": []}}, [])
scanner._database.has_fresh_similar_artists = lambda *args, **kwargs: False
scanner.database.should_populate_discovery_pool = lambda hours_threshold=24, profile_id=1: True
scanner.database.get_top_similar_artists = lambda limit=50, profile_id=1: [similar_artist]
scanner.database.db_albums = []
# Replace the recent-album caching and playlist curation methods with no-ops.
scanner.cache_discovery_recent_albums = lambda *args, **kwargs: None
scanner.curate_discovery_playlists = lambda *args, **kwargs: None
scanner.database.update_discovery_pool_timestamp = lambda *args, **kwargs: True
scanner.database.cleanup_old_discovery_tracks = lambda *args, **kwargs: 0
scanner.populate_discovery_pool(top_artists_limit=1, albums_per_artist=1, profile_id=1)
# At least one pool write happened and the first one is tagged "deezer".
assert scanner.database.discovery_pool_calls
assert scanner.database.discovery_pool_calls[0][1] == "deezer"
assert deezer_client.album_calls
# Spotify recorded no artist searches and no get_artist calls.
assert spotify_client.search_calls == []
assert spotify_client.artist_calls == []
def test_populate_discovery_pool_falls_back_to_spotify_when_primary_has_no_albums(monkeypatch):
    """Discovery pool falls through to Spotify when the primary source has no albums.

    Deezer is patched in as the primary source, but its fake client is built
    with an empty album list while the Spotify fake has one album with one
    track. The test then checks that:

    * the first discovery-pool write is tagged ``"spotify"``;
    * the Deezer client was still asked for albums;
    * the Spotify artist search was issued with ``allow_fallback=False``;
    * at least one four-element Spotify album call for ``"sp-artist"``
      carried ``skip_cache=False``, ``allow_fallback=False`` and
      ``max_pages=2``.
    """
    # Zero the module-level delay constants and stub out ``time.sleep`` on the
    # scanner module.
    for delay_name in ("DELAY_BETWEEN_ARTISTS", "DELAY_BETWEEN_ALBUMS"):
        monkeypatch.setattr(watchlist_scanner_module, delay_name, 0)
    sleepless_time = types.SimpleNamespace(sleep=lambda *_args, **_kwargs: None)
    monkeypatch.setattr(watchlist_scanner_module, "time", sleepless_time)

    # Source priority: deezer first, then spotify, then itunes.
    monkeypatch.setattr(watchlist_scanner_module, "get_primary_source", lambda: "deezer")
    monkeypatch.setattr(
        watchlist_scanner_module,
        "get_source_priority",
        lambda primary: [primary, "spotify", "itunes"],
    )

    fallback_artist = types.SimpleNamespace(
        id=502,
        similar_artist_name="Fallback Artist",
        occurrence_count=1,
        similar_artist_spotify_id="sp-artist",
        similar_artist_itunes_id="it-artist",
        similar_artist_deezer_id="dz-artist",
    )

    # Primary source: knows the artist but has no albums to offer.
    deezer_client = _FakeSourceClient(
        artist_id="dz-artist",
        albums=[],
        image_url="https://example.com/deezer-artist.jpg",
    )

    # Secondary source: one album containing a single track.
    spotify_track = {
        "id": "sp-track-1",
        "name": "Spotify Track",
        "duration_ms": 234567,
        "artists": [{"name": "Fallback Artist"}],
    }
    spotify_album_payload = {
        "id": "sp-album-1",
        "name": "Spotify Album",
        "images": [{"url": "https://example.com/spotify-album.jpg"}],
        "release_date": "2026-04-01",
        "popularity": 50,
        "tracks": {"items": [spotify_track]},
        "artists": [{"id": "sp-artist"}],
    }
    spotify_client = _FakeSourceClient(
        artist_id="sp-artist",
        albums=[types.SimpleNamespace(id="sp-album-1", name="Spotify Album", album_type="album")],
        image_url="https://example.com/spotify-artist.jpg",
        album_payload=spotify_album_payload,
    )

    # Any source other than deezer/spotify (e.g. itunes) resolves to None.
    clients_by_source = {"deezer": deezer_client, "spotify": spotify_client}
    monkeypatch.setattr(
        watchlist_scanner_module,
        "get_client_for_source",
        lambda source: clients_by_source.get(source),
    )

    scanner = _build_scanner({"tracks": {"items": []}}, [])
    scanner._database.has_fresh_similar_artists = lambda *args, **kwargs: False
    database_stubs = {
        "should_populate_discovery_pool": lambda hours_threshold=24, profile_id=1: True,
        "get_top_similar_artists": lambda limit=50, profile_id=1: [fallback_artist],
        "update_discovery_pool_timestamp": lambda *args, **kwargs: True,
        "cleanup_old_discovery_tracks": lambda *args, **kwargs: 0,
    }
    for attr_name, stub in database_stubs.items():
        setattr(scanner.database, attr_name, stub)
    scanner.database.db_albums = []

    # Replace the two scanner methods that are not under test with no-ops.
    scanner.cache_discovery_recent_albums = lambda *args, **kwargs: None
    scanner.curate_discovery_playlists = lambda *args, **kwargs: None

    scanner.populate_discovery_pool(top_artists_limit=1, albums_per_artist=1, profile_id=1)

    def is_capped_strict_artist_albums_call(call):
        """Match a 4-tuple call for sp-artist with cache on, strict mode, 2-page cap."""
        if not isinstance(call, tuple) or len(call) != 4:
            return False
        options = call[3]
        return (
            call[0] == "sp-artist"
            and options.get("skip_cache") is False
            and options.get("allow_fallback") is False
            and options.get("max_pages") == 2
        )

    def is_strict_four_item_call(call):
        """Match any 4-tuple call whose options disable fallback."""
        if not isinstance(call, tuple) or len(call) != 4:
            return False
        return call[3].get("allow_fallback") is False

    pool_calls = scanner.database.discovery_pool_calls
    assert pool_calls
    assert pool_calls[0][1] == "spotify"
    assert deezer_client.album_calls
    assert spotify_client.search_calls == [("Fallback Artist", 1, {"allow_fallback": False})]
    assert spotify_client.album_calls
    assert any(is_capped_strict_artist_albums_call(call) for call in spotify_client.album_calls)
    # NOTE(review): this check is implied by the one above (same 4-tuple
    # shape, weaker conditions). Possibly meant to target a different call
    # shape -- confirm against _FakeSourceClient before tightening.
    assert any(is_strict_four_item_call(call) for call in spotify_client.album_calls)
def test_populate_discovery_pool_uses_strict_spotify_for_database_album_search(monkeypatch):
    """Spotify album search driven by database albums must run in strict mode.

    Both fake clients return no albums for the artist, while the scanner's
    database exposes one album (``db_albums``) and the Spotify fake is given a
    matching album-search result. The test then checks that:

    * the first discovery-pool write is tagged ``"spotify"``;
    * Spotify album search was called, at least once with
      ``allow_fallback=False``;
    * at least one four-element Spotify album call for ``"sp-artist"``
      carried ``skip_cache=False``, ``allow_fallback=False`` and
      ``max_pages=2``;
    * at least one two-element Spotify album call carried
      ``allow_fallback=False``.
    """
    # Zero the module-level delay constants and stub out ``time.sleep`` on the
    # scanner module.
    for delay_name in ("DELAY_BETWEEN_ARTISTS", "DELAY_BETWEEN_ALBUMS"):
        monkeypatch.setattr(watchlist_scanner_module, delay_name, 0)
    sleepless_time = types.SimpleNamespace(sleep=lambda *_args, **_kwargs: None)
    monkeypatch.setattr(watchlist_scanner_module, "time", sleepless_time)

    # Source priority: deezer first, then spotify, then itunes.
    monkeypatch.setattr(watchlist_scanner_module, "get_primary_source", lambda: "deezer")
    monkeypatch.setattr(
        watchlist_scanner_module,
        "get_source_priority",
        lambda primary: [primary, "spotify", "itunes"],
    )

    artist_without_albums = types.SimpleNamespace(
        id=503,
        similar_artist_name="No Album Artist",
        occurrence_count=1,
        similar_artist_spotify_id="sp-artist",
        similar_artist_itunes_id="it-artist",
        similar_artist_deezer_id="dz-artist",
    )

    deezer_client = _FakeSourceClient(
        artist_id="dz-artist",
        albums=[],
        image_url="https://example.com/deezer-artist.jpg",
    )

    # Spotify lists no artist albums either, but can resolve an album search.
    db_track = {
        "id": "sp-db-track-1",
        "name": "DB Track",
        "duration_ms": 345678,
        "artists": [{"name": "DB Artist"}],
    }
    db_album_payload = {
        "id": "sp-db-album",
        "name": "DB Album",
        "images": [{"url": "https://example.com/db-album.jpg"}],
        "release_date": "2026-04-01",
        "popularity": 75,
        "tracks": {"items": [db_track]},
        "artists": [{"id": "sp-artist"}],
    }
    spotify_client = _FakeSourceClient(
        artist_id="sp-artist",
        albums=[],
        image_url="https://example.com/spotify-artist.jpg",
        album_search_results=[types.SimpleNamespace(id="sp-db-album", name="DB Album")],
        album_payload=db_album_payload,
    )

    # Any source other than deezer/spotify (e.g. itunes) resolves to None.
    clients_by_source = {"deezer": deezer_client, "spotify": spotify_client}
    monkeypatch.setattr(
        watchlist_scanner_module,
        "get_client_for_source",
        lambda source: clients_by_source.get(source),
    )

    scanner = _build_scanner({"tracks": {"items": []}}, [])
    scanner._database.has_fresh_similar_artists = lambda *args, **kwargs: False
    database_stubs = {
        "should_populate_discovery_pool": lambda hours_threshold=24, profile_id=1: True,
        "get_top_similar_artists": lambda limit=50, profile_id=1: [artist_without_albums],
        "update_discovery_pool_timestamp": lambda *args, **kwargs: True,
        "cleanup_old_discovery_tracks": lambda *args, **kwargs: 0,
    }
    for attr_name, stub in database_stubs.items():
        setattr(scanner.database, attr_name, stub)
    scanner.database.db_albums = [{"title": "DB Album", "artist_name": "DB Artist"}]

    # Replace the two scanner methods that are not under test with no-ops.
    scanner.cache_discovery_recent_albums = lambda *args, **kwargs: None
    scanner.curate_discovery_playlists = lambda *args, **kwargs: None

    scanner.populate_discovery_pool(top_artists_limit=1, albums_per_artist=1, profile_id=1)

    def is_capped_strict_artist_albums_call(call):
        """Match a 4-tuple call for sp-artist with cache on, strict mode, 2-page cap."""
        if not isinstance(call, tuple) or len(call) != 4:
            return False
        options = call[3]
        return (
            call[0] == "sp-artist"
            and options.get("skip_cache") is False
            and options.get("allow_fallback") is False
            and options.get("max_pages") == 2
        )

    def is_strict_two_item_call(call):
        """Match a 2-tuple call whose options disable fallback."""
        if not isinstance(call, tuple) or len(call) != 2:
            return False
        return call[1].get("allow_fallback") is False

    pool_calls = scanner.database.discovery_pool_calls
    assert pool_calls
    assert pool_calls[0][1] == "spotify"

    assert spotify_client.search_album_calls
    assert any(
        options.get("allow_fallback") is False
        for _first, _second, options in spotify_client.search_album_calls
    )

    assert any(is_capped_strict_artist_albums_call(call) for call in spotify_client.album_calls)

    # Pre-filter lazily to two-element entries before applying the predicate.
    two_item_calls = (call for call in spotify_client.album_calls if len(call) == 2)
    assert any(is_strict_two_item_call(call) for call in two_item_calls)
def test_match_to_spotify_uses_strict_lookup():
spotify_client = _FakeSpotifyClient(
search_results=[types.SimpleNamespace(id="fallback-id", name="Artist One")]

Loading…
Cancel
Save