Reduce watchlist Spotify API calls ~90% + configurable rate interval

Addresses all three points from community rate-limiting report:

1. Watchlist scans fetched ALL albums then filtered — 262 albums = 27
   API calls per artist. Now determines upfront if full discography is
   needed: subsequent scans and time-bounded lookbacks use max_pages=1
   (1 API call). Only "full discography" global setting fetches all.

2. MIN_API_INTERVAL (350ms) now configurable via spotify.min_api_interval
   setting. Users who get rate-limited frequently can increase the delay.
   Values are floored at 100ms so a misconfigured setting cannot
   effectively disable throttling.

3. Retry-After header extraction improved: added diagnostic logging when
   headers exist but lack Retry-After key, plus regex fallback to parse
   the value from the error message string.
pull/253/head
Broque Thomas 2 weeks ago
parent 30d5f76e3d
commit 4e4f258d25

@ -14,7 +14,18 @@ logger = get_logger("spotify_client")
# Global rate limiting variables
_last_api_call_time = 0
_api_call_lock = threading.Lock()
MIN_API_INTERVAL = 0.35 # 350ms between API calls (~171/min, under Spotify's ~180/min limit)
MIN_API_INTERVAL = 0.35 # Default: 350ms between API calls (~171/min, under Spotify's ~180/min limit)
def _get_min_api_interval():
"""Get configurable API interval from settings, falling back to default."""
try:
from config.settings import config_manager
val = config_manager.get('spotify.min_api_interval', None)
if val is not None:
return max(0.1, float(val)) # Floor at 100ms to prevent abuse
except Exception:
pass
return MIN_API_INTERVAL
# Request queuing for burst handling
import queue
@ -178,8 +189,23 @@ def _detect_and_set_rate_limit(exception, endpoint_name="unknown"):
# Try to extract Retry-After from exception headers
retry_after = None
has_real_header = False
if hasattr(exception, 'headers') and exception.headers:
retry_after = exception.headers.get('Retry-After') or exception.headers.get('retry-after')
# Method 1: SpotifyException.headers (set by spotipy with retries=0)
exc_headers = getattr(exception, 'headers', None)
if exc_headers and hasattr(exc_headers, 'get'):
retry_after = exc_headers.get('Retry-After') or exc_headers.get('retry-after')
if retry_after:
logger.info(f"Extracted Retry-After from exception headers: {retry_after}")
else:
logger.debug(f"Exception has headers but no Retry-After key. Headers type: {type(exc_headers).__name__}, keys: {list(exc_headers.keys())[:10] if hasattr(exc_headers, 'keys') else 'N/A'}")
# Method 2: Parse from error message (some spotipy versions embed it)
if not retry_after:
import re
ra_match = re.search(r'[Rr]etry[- ][Aa]fter[:\s]+(\d+)', error_str)
if ra_match:
retry_after = ra_match.group(1)
logger.info(f"Extracted Retry-After from error message: {retry_after}")
if retry_after:
try:
@ -224,13 +250,14 @@ def rate_limited(func):
if _is_globally_rate_limited():
raise SpotifyRateLimitError(0, func.__name__)
# Enforce minimum interval between API calls
# Enforce minimum interval between API calls (configurable via settings)
_interval = _get_min_api_interval()
with _api_call_lock:
current_time = time.time()
time_since_last_call = current_time - _last_api_call_time
if time_since_last_call < MIN_API_INTERVAL:
sleep_time = MIN_API_INTERVAL - time_since_last_call
if time_since_last_call < _interval:
sleep_time = _interval - time_since_last_call
time.sleep(sleep_time)
_last_api_call_time = time.time()
@ -681,8 +708,9 @@ class SpotifyClient:
if results['next']:
with _api_call_lock:
elapsed = time.time() - _last_api_call_time
if elapsed < MIN_API_INTERVAL:
time.sleep(MIN_API_INTERVAL - elapsed)
_pi = _get_min_api_interval()
if elapsed < _pi:
time.sleep(_pi - elapsed)
globals()['_last_api_call_time'] = time.time()
from core.api_call_tracker import api_call_tracker
api_call_tracker.record_call('spotify', endpoint='get_user_playlists_page')
@ -960,8 +988,9 @@ class SpotifyClient:
if results['next']:
with _api_call_lock:
elapsed = time.time() - _last_api_call_time
if elapsed < MIN_API_INTERVAL:
time.sleep(MIN_API_INTERVAL - elapsed)
_pi = _get_min_api_interval()
if elapsed < _pi:
time.sleep(_pi - elapsed)
globals()['_last_api_call_time'] = time.time()
from core.api_call_tracker import api_call_tracker
api_call_tracker.record_call('spotify', endpoint='get_playlist_tracks_page')
@ -1304,8 +1333,9 @@ class SpotifyClient:
while next_page.get('next'):
with _api_call_lock:
elapsed = time.time() - _last_api_call_time
if elapsed < MIN_API_INTERVAL:
time.sleep(MIN_API_INTERVAL - elapsed)
_pi = _get_min_api_interval()
if elapsed < _pi:
time.sleep(_pi - elapsed)
globals()['_last_api_call_time'] = time.time()
from core.api_call_tracker import api_call_tracker
api_call_tracker.record_call('spotify', endpoint='get_album_tracks_page')
@ -1351,9 +1381,11 @@ class SpotifyClient:
return None
@rate_limited
def get_artist_albums(self, artist_id: str, album_type: str = 'album,single', limit: int = 10, skip_cache: bool = False) -> List[Album]:
def get_artist_albums(self, artist_id: str, album_type: str = 'album,single', limit: int = 10, skip_cache: bool = False, max_pages: int = 0) -> List[Album]:
"""Get albums by artist ID - falls back to iTunes if Spotify not authenticated.
Set skip_cache=True for watchlist scans that need fresh data to detect new releases."""
Set skip_cache=True for watchlist scans that need fresh data to detect new releases.
Set max_pages to limit pagination (0 = fetch all). Spotify returns newest first,
so max_pages=1 is sufficient for new release detection."""
cache = get_metadata_cache()
fallback_src = self._fallback_source
source = fallback_src if self._is_itunes_id(artist_id) else 'spotify'
@ -1373,7 +1405,9 @@ class SpotifyClient:
try:
albums = []
raw_items = []
# Spotify caps artist_albums at 10 per page
results = self.sp.artist_albums(artist_id, album_type=album_type, limit=min(limit, 10))
pages_fetched = 1
while results:
for album_data in results['items']:
@ -1381,21 +1415,28 @@ class SpotifyClient:
albums.append(album)
raw_items.append(album_data)
# Stop if we've hit the page limit (0 = unlimited)
if max_pages and pages_fetched >= max_pages:
break
# Get next batch if available — throttle pagination to respect rate limits
if results['next']:
# Enforce same rate limit as decorated calls
with _api_call_lock:
elapsed = time.time() - _last_api_call_time
if elapsed < MIN_API_INTERVAL:
time.sleep(MIN_API_INTERVAL - elapsed)
_pi = _get_min_api_interval()
if elapsed < _pi:
time.sleep(_pi - elapsed)
globals()['_last_api_call_time'] = time.time()
from core.api_call_tracker import api_call_tracker
api_call_tracker.record_call('spotify', endpoint='get_artist_albums_page')
results = self.sp.next(results)
pages_fetched += 1
else:
results = None
logger.info(f"Retrieved {len(albums)} albums for artist {artist_id}")
logger.info(f"Retrieved {len(albums)} albums for artist {artist_id}" +
(f" (page limit: {max_pages})" if max_pages else ""))
# Cache the full artist albums result (wrapped in dict for cache compatibility)
if raw_items:

@ -857,34 +857,41 @@ class WatchlistScanner:
If None, uses lookback period setting from database
"""
try:
# Get all artist albums (albums + singles) - this is rate limited in spotify_client
logger.debug(f"Fetching discography for artist {spotify_artist_id}")
albums = self.spotify_client.get_artist_albums(spotify_artist_id, album_type='album,single', limit=50, skip_cache=True)
if not albums:
logger.warning(f"No albums found for artist {spotify_artist_id}")
return []
# Add small delay after fetching artist discography to be extra safe
time.sleep(0.3) # 300ms breathing room
# Determine cutoff date for filtering
# Determine if we need the full discography or just recent releases.
# Spotify returns albums sorted newest-first, so for time-bounded scans
# we only need the first page (50 albums) — this cuts API calls by ~90%
# for prolific artists (262 albums = 27 calls → 1 call).
needs_full_discog = False
cutoff_timestamp = last_scan_timestamp
# If no last scan timestamp, use per-artist lookback or global setting
if cutoff_timestamp is None:
if lookback_days is not None:
# Per-artist override
cutoff_timestamp = datetime.now(timezone.utc) - timedelta(days=lookback_days)
logger.info(f"Using per-artist lookback: {lookback_days} days (cutoff: {cutoff_timestamp})")
else:
# Global setting
lookback_period = self._get_lookback_period_setting()
if lookback_period != 'all':
if lookback_period == 'all':
needs_full_discog = True
else:
days = int(lookback_period)
cutoff_timestamp = datetime.now(timezone.utc) - timedelta(days=days)
logger.info(f"Using global lookback period: {lookback_period} days (cutoff: {cutoff_timestamp})")
# Fetch albums — limit pagination unless full discography is needed
logger.debug(f"Fetching discography for artist {spotify_artist_id}" +
(" (full)" if needs_full_discog else " (recent only, max 1 page)"))
albums = self.spotify_client.get_artist_albums(
spotify_artist_id, album_type='album,single', limit=50,
skip_cache=True, max_pages=0 if needs_full_discog else 1
)
if not albums:
logger.warning(f"No albums found for artist {spotify_artist_id}")
return []
# Add small delay after fetching artist discography to be extra safe
time.sleep(0.3) # 300ms breathing room
# Filter by release date if we have a cutoff timestamp
if cutoff_timestamp:
filtered_albums = []
@ -914,25 +921,15 @@ class WatchlistScanner:
lookback_days: Per-artist override for lookback period (None = use global setting)
"""
try:
# Get all artist albums (albums + singles)
# skip_cache for Spotify so watchlist scans always get fresh data
logger.debug(f"Fetching discography for artist {artist_id}")
_skip = {'skip_cache': True} if hasattr(client, 'sp') else {}
albums = client.get_artist_albums(artist_id, album_type='album,single', limit=50, **_skip)
if not albums:
logger.warning(f"No albums found for artist {artist_id}")
return []
# Add small delay after fetching artist discography to be extra safe
time.sleep(0.3) # 300ms breathing room
# Determine cutoff date for filtering
# Determine if we need full discography or just recent releases BEFORE fetching.
# Spotify returns albums newest-first, so for time-bounded scans we only need
# the first page (50 albums) — cuts API calls by ~90% for prolific artists.
lookback_period = self._get_lookback_period_setting()
needs_full_discog = False
# If lookback is 'all', always return everything regardless of scan timestamp
if lookback_period == 'all':
cutoff_timestamp = None
needs_full_discog = True
elif last_scan_timestamp is not None:
cutoff_timestamp = last_scan_timestamp
@ -941,6 +938,7 @@ class WatchlistScanner:
if rescan_cutoff == 'all':
logger.info(f"Lookback period changed to 'all' — returning full discography")
cutoff_timestamp = None
needs_full_discog = True
elif rescan_cutoff is not None:
scan_ts = cutoff_timestamp
if scan_ts.tzinfo is None:
@ -951,10 +949,30 @@ class WatchlistScanner:
logger.info(f"Lookback period change detected — expanding cutoff from {cutoff_timestamp} to {rescan_cutoff}")
cutoff_timestamp = rescan_cutoff
else:
# No scan timestamp — use lookback period
days = int(lookback_period)
# No scan timestamp — first scan, use lookback period
if lookback_days is not None:
days = lookback_days
else:
days = int(lookback_period)
cutoff_timestamp = datetime.now(timezone.utc) - timedelta(days=days)
logger.info(f"Using lookback period: {lookback_period} days (cutoff: {cutoff_timestamp})")
logger.info(f"Using lookback period: {days} days (cutoff: {cutoff_timestamp})")
# Fetch albums — limit pagination unless full discography is needed
logger.debug(f"Fetching discography for artist {artist_id}" +
(" (full)" if needs_full_discog else " (recent only, max 1 page)"))
_skip = {'skip_cache': True} if hasattr(client, 'sp') else {}
_max_pages = 0 if needs_full_discog else 1
# Only pass max_pages to clients that support it (spotify_client)
if hasattr(client, 'sp'):
_skip['max_pages'] = _max_pages
albums = client.get_artist_albums(artist_id, album_type='album,single', limit=50, **_skip)
if not albums:
logger.warning(f"No albums found for artist {artist_id}")
return []
# Add small delay after fetching artist discography to be extra safe
time.sleep(0.3) # 300ms breathing room
# Filter by release date if we have a cutoff timestamp
if cutoff_timestamp:

@ -3404,6 +3404,7 @@ const WHATS_NEW = {
'2.2': [
// Newest features first
{ title: 'Fix Album Folder Splitting', desc: 'Collab albums and artist name changes no longer scatter tracks across multiple folders — $albumartist now uses album-level artist consistently' },
{ title: 'Fix Watchlist Rate Limiting', desc: 'Watchlist scans now fetch only newest albums instead of full discography (~90% fewer API calls). Configurable API interval in settings. Better Retry-After header extraction' },
{ title: 'Discogs Integration', desc: 'New metadata source — enrichment worker, fallback source, enhanced search tab, watchlist support, cache browser. Genres, styles, labels, bios, ratings from 400+ taxonomy', page: 'dashboard' },
{ title: 'Webhook THEN Action', desc: 'Send HTTP POST to any URL when automations complete — integrate with Gotify, Home Assistant, Slack, n8n. Configurable headers and message template', page: 'automations' },
{ title: 'API Rate Monitor', desc: 'Real-time speedometer gauges for all enrichment services on the Dashboard. Click any gauge for 24h history chart. Spotify shows per-endpoint breakdown', page: 'dashboard' },

Loading…
Cancel
Save