caching for similar artists

pull/64/head
Broque Thomas 6 months ago
parent eece3e97fd
commit 76dca2f045

@ -246,8 +246,17 @@ class WatchlistScanner:
# Update last scan timestamp for this artist
self.update_artist_scan_timestamp(watchlist_artist.spotify_artist_id)
# Fetch and store similar artists for discovery feature
self.update_similar_artists(watchlist_artist)
# Fetch and store similar artists for discovery feature (with caching to avoid over-polling)
try:
# Check if we have fresh similar artists cached (< 30 days old)
if self.database.has_fresh_similar_artists(watchlist_artist.spotify_artist_id, days_threshold=30):
logger.info(f"Similar artists for {watchlist_artist.artist_name} are cached and fresh, skipping fetch")
else:
logger.info(f"Fetching similar artists for {watchlist_artist.artist_name}...")
self.update_similar_artists(watchlist_artist)
logger.info(f"Similar artists updated for {watchlist_artist.artist_name}")
except Exception as similar_error:
logger.warning(f"Failed to update similar artists for {watchlist_artist.artist_name}: {similar_error}")
return ScanResult(
artist_name=watchlist_artist.artist_name,
@ -656,11 +665,21 @@ class WatchlistScanner:
"""
Populate discovery pool with tracks from top similar artists.
Called after watchlist scan completes.
This method now:
- Checks if pool was updated in last 24 hours (prevents over-polling Spotify)
- Appends to existing pool instead of replacing it
- Cleans up tracks older than 365 days (maintains 1 year rolling window)
"""
try:
from datetime import datetime, timedelta
import random
# Check if we should run (prevents over-polling Spotify)
if not self.database.should_populate_discovery_pool(hours_threshold=24):
logger.info("Discovery pool was populated recently (< 24 hours ago). Skipping to avoid over-polling Spotify.")
return
logger.info("Populating discovery pool from similar artists...")
# Get top similar artists across all watchlist (ordered by occurrence_count)
@ -776,15 +795,110 @@ class WatchlistScanner:
logger.warning(f"Error processing artist {similar_artist.similar_artist_name}: {artist_error}")
continue
logger.info(f"Discovery pool population complete: {total_tracks_added} tracks added")
logger.info(f"Discovery pool from similar artists complete: {total_tracks_added} tracks added")
# Rotate discovery pool if needed (maintain 1000-2000 track limit)
self.database.rotate_discovery_pool(max_tracks=2000, remove_count=500)
# Note: Watchlist artist albums are already in discovery pool from the watchlist scan itself
# No need to re-fetch them here to avoid duplicate API calls
# Add tracks from random database albums for extra variety (reduced to 5 to save API calls)
logger.info("Adding tracks from database albums to discovery pool...")
try:
with self.database._get_connection() as conn:
cursor = conn.cursor()
cursor.execute("""
SELECT DISTINCT a.title, ar.name as artist_name
FROM albums_new a
JOIN artists_new ar ON a.artist_id = ar.id
ORDER BY RANDOM()
LIMIT 5
""")
db_albums = cursor.fetchall()
logger.info(f"Processing {len(db_albums)} database albums for discovery pool")
for db_idx, album_row in enumerate(db_albums, 1):
try:
# Search for album on Spotify
query = f"album:{album_row['title']} artist:{album_row['artist_name']}"
search_results = self.spotify_client.search_albums(query, limit=1)
if search_results and len(search_results) > 0:
spotify_album = search_results[0]
album_data = self.spotify_client.get_album(spotify_album.id)
if album_data and 'tracks' in album_data:
tracks = album_data['tracks'].get('items', [])
# Check if new release
is_new = False
try:
release_date_str = album_data.get('release_date', '')
if release_date_str and len(release_date_str) == 10:
release_date = datetime.strptime(release_date_str, "%Y-%m-%d")
days_old = (datetime.now() - release_date).days
is_new = days_old <= 30
except:
pass
for track in tracks:
try:
track_data = {
'spotify_track_id': track['id'],
'spotify_album_id': album_data['id'],
'spotify_artist_id': album_data['artists'][0]['id'] if album_data.get('artists') else '',
'track_name': track['name'],
'artist_name': album_row['artist_name'],
'album_name': album_row['title'],
'album_cover_url': album_data.get('images', [{}])[0].get('url') if album_data.get('images') else None,
'duration_ms': track.get('duration_ms', 0),
'popularity': album_data.get('popularity', 0),
'release_date': album_data.get('release_date', ''),
'is_new_release': is_new,
'track_data_json': track
}
if self.database.add_to_discovery_pool(track_data):
total_tracks_added += 1
except Exception as track_error:
continue
time.sleep(DELAY_BETWEEN_ALBUMS)
except Exception as album_error:
logger.debug(f"Error processing database album {album_row['title']}: {album_error}")
continue
# Rate limit between albums
if db_idx < len(db_albums):
time.sleep(DELAY_BETWEEN_ARTISTS)
except Exception as db_error:
logger.warning(f"Error processing database albums: {db_error}")
logger.info(f"Discovery pool population complete: {total_tracks_added} total tracks added from all sources")
# Clean up tracks older than 365 days (maintain 1 year rolling window)
logger.info("Cleaning up discovery tracks older than 365 days...")
deleted_count = self.database.cleanup_old_discovery_tracks(days_threshold=365)
logger.info(f"Cleaned up {deleted_count} old tracks from discovery pool")
# Get final track count for metadata
with self.database._get_connection() as conn:
cursor = conn.cursor()
cursor.execute("SELECT COUNT(*) as count FROM discovery_pool")
final_count = cursor.fetchone()['count']
# Update timestamp to mark when pool was last populated
self.database.update_discovery_pool_timestamp(track_count=final_count)
logger.info(f"Discovery pool now contains {final_count} total tracks (built over time)")
# Cache recent albums for discovery page
logger.info("Caching recent albums for discovery page...")
self.cache_discovery_recent_albums()
# Curate playlists for consistent daily experience
logger.info("Curating discovery playlists...")
self.curate_discovery_playlists()
except Exception as e:
logger.error(f"Error populating discovery pool: {e}")
import traceback
@ -902,6 +1016,72 @@ class WatchlistScanner:
import traceback
traceback.print_exc()
def curate_discovery_playlists(self):
    """Build and persist the curated playlist selections.

    The stored selections stay identical until the next discovery pool
    update, giving users a stable Release Radar / Discovery Weekly
    between scans. Errors are logged, never raised.
    """
    try:
        import random

        # --- Release Radar: up to 50 tracks from recently cached albums ---
        logger.info("Curating Release Radar playlist...")
        recent_albums = self.database.get_discovery_recent_albums(limit=20)
        release_radar_tracks = []
        if recent_albums:
            # Bucket albums per artist so one artist cannot dominate the list.
            albums_by_artist = {}
            for album in recent_albums:
                albums_by_artist.setdefault(album['artist_name'], []).append(album)

            # Collect every track id per artist; individual Spotify
            # lookup failures are skipped silently.
            tracks_per_artist = {}
            for artist_name, artist_albums in albums_by_artist.items():
                collected = []
                for album in artist_albums:
                    try:
                        album_data = self.spotify_client.get_album(album['album_spotify_id'])
                        if album_data and 'tracks' in album_data:
                            collected.extend(t['id'] for t in album_data['tracks']['items'])
                    except Exception:
                        continue
                tracks_per_artist[artist_name] = collected

            # Cap each artist at 6 randomly chosen tracks, then draw 50 overall.
            balanced = []
            for collected in tracks_per_artist.values():
                random.shuffle(collected)
                balanced.extend(collected[:6])
            random.shuffle(balanced)
            release_radar_tracks = balanced[:50]

        self.database.save_curated_playlist('release_radar', release_radar_tracks)
        logger.info(f"Release Radar curated: {len(release_radar_tracks)} tracks")

        # --- Discovery Weekly: 50 random tracks from the whole discovery pool ---
        logger.info("Curating Discovery Weekly playlist...")
        pool = self.database.get_discovery_pool_tracks(limit=1000, new_releases_only=False)
        discovery_weekly_tracks = []
        if pool:
            candidate_ids = [track.spotify_track_id for track in pool]
            random.shuffle(candidate_ids)
            discovery_weekly_tracks = candidate_ids[:50]

        self.database.save_curated_playlist('discovery_weekly', discovery_weekly_tracks)
        logger.info(f"Discovery Weekly curated: {len(discovery_weekly_tracks)} tracks")
        logger.info("Playlist curation complete")
    except Exception as e:
        logger.error(f"Error curating discovery playlists: {e}")
        import traceback
        traceback.print_exc()
# Singleton instance
_watchlist_scanner_instance = None

@ -488,6 +488,26 @@ class MusicDatabase:
)
""")
# Discovery Curated Playlists - store curated track selections for consistency
cursor.execute("""
CREATE TABLE IF NOT EXISTS discovery_curated_playlists (
id INTEGER PRIMARY KEY AUTOINCREMENT,
playlist_type TEXT NOT NULL UNIQUE,
track_ids_json TEXT NOT NULL,
curated_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
""")
# Discovery Pool Metadata - track when pool was last populated to prevent over-polling
cursor.execute("""
CREATE TABLE IF NOT EXISTS discovery_pool_metadata (
id INTEGER PRIMARY KEY CHECK (id = 1),
last_populated_timestamp TIMESTAMP NOT NULL,
track_count INTEGER DEFAULT 0,
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
""")
# Create indexes for performance
cursor.execute("CREATE INDEX IF NOT EXISTS idx_similar_artists_source ON similar_artists (source_artist_id)")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_similar_artists_spotify ON similar_artists (similar_artist_spotify_id)")
@ -2490,6 +2510,37 @@ class MusicDatabase:
logger.error(f"Error getting similar artists: {e}")
return []
def has_fresh_similar_artists(self, source_artist_id: str, days_threshold: int = 30) -> bool:
    """Report whether cached similar artists for *source_artist_id* are fresh.

    Fresh means the newest cached row is younger than ``days_threshold``
    days. Returns False when nothing is cached, the data is stale, or the
    check itself fails (so the caller falls back to re-fetching).
    """
    try:
        with self._get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("""
                SELECT COUNT(*) as count, MAX(last_updated) as last_updated
                FROM similar_artists
                WHERE source_artist_id = ?
            """, (source_artist_id,))
            row = cursor.fetchone()

        # Nothing cached at all -> must fetch.
        if not row or not row['count']:
            return False

        # Age of the newest cached entry, in fractional days (86400 s/day).
        newest = datetime.fromisoformat(row['last_updated'])
        age_days = (datetime.now() - newest).total_seconds() / 86400
        return age_days < days_threshold
    except Exception as e:
        logger.error(f"Error checking similar artists freshness: {e}")
        return False  # Default to re-fetching on error
def get_top_similar_artists(self, limit: int = 50) -> List[SimilarArtist]:
"""Get top similar artists across all watchlist artists, ordered by occurrence count"""
try:
@ -2701,6 +2752,106 @@ class MusicDatabase:
logger.error(f"Error clearing discovery recent albums: {e}")
return False
def save_curated_playlist(self, playlist_type: str, track_ids: List[str]) -> bool:
    """Persist one curated playlist selection as JSON.

    The row is keyed by the UNIQUE ``playlist_type`` column, so each
    playlist type keeps exactly one (most recent) selection.
    Returns True on success, False on any error.
    """
    try:
        import json
        serialized = json.dumps(track_ids)
        with self._get_connection() as conn:
            cursor = conn.cursor()
            # Upsert: replace the previous selection for this type, if any.
            cursor.execute("""
                INSERT OR REPLACE INTO discovery_curated_playlists
                (playlist_type, track_ids_json, curated_date)
                VALUES (?, ?, CURRENT_TIMESTAMP)
            """, (playlist_type, serialized))
            conn.commit()
        return True
    except Exception as e:
        logger.error(f"Error saving curated playlist {playlist_type}: {e}")
        return False
def get_curated_playlist(self, playlist_type: str) -> Optional[List[str]]:
    """Load the saved track-id list for *playlist_type*.

    Returns the decoded list, or None when no selection is stored
    (or when lookup/decoding fails).
    """
    try:
        import json
        with self._get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("""
                SELECT track_ids_json FROM discovery_curated_playlists
                WHERE playlist_type = ?
            """, (playlist_type,))
            row = cursor.fetchone()
        return json.loads(row['track_ids_json']) if row else None
    except Exception as e:
        logger.error(f"Error getting curated playlist {playlist_type}: {e}")
        return None
def should_populate_discovery_pool(self, hours_threshold: int = 24) -> bool:
    """Decide whether the discovery pool is due for (re)population.

    Due means one of: it has never been populated, the last population is
    at least ``hours_threshold`` hours old, or the check itself errored
    (fail open so population is never blocked by a metadata problem).
    """
    try:
        with self._get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("""
                SELECT last_populated_timestamp
                FROM discovery_pool_metadata
                WHERE id = 1
            """)
            row = cursor.fetchone()

        if row is None:
            # Never populated before
            return True

        last_populated = datetime.fromisoformat(row['last_populated_timestamp'])
        elapsed_hours = (datetime.now() - last_populated).total_seconds() / 3600
        return elapsed_hours >= hours_threshold
    except Exception as e:
        logger.error(f"Error checking discovery pool timestamp: {e}")
        return True  # Default to allowing population on error
def update_discovery_pool_timestamp(self, track_count: int) -> bool:
    """Record the current time and *track_count* as the pool's last population.

    The metadata table holds exactly one row (id fixed at 1), so this is a
    plain upsert. Returns True on success, False on any error.
    """
    try:
        stamp = datetime.now().isoformat()
        with self._get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("""
                INSERT OR REPLACE INTO discovery_pool_metadata
                (id, last_populated_timestamp, track_count, updated_at)
                VALUES (1, ?, ?, CURRENT_TIMESTAMP)
            """, (stamp, track_count))
            conn.commit()
        return True
    except Exception as e:
        logger.error(f"Error updating discovery pool timestamp: {e}")
        return False
def cleanup_old_discovery_tracks(self, days_threshold: int = 365) -> int:
    """Delete discovery-pool rows older than *days_threshold* days.

    Maintains the rolling window by letting SQLite compute the cutoff
    (``datetime('now', '-N days')``, which is UTC — assumes added_date is
    also stored in UTC; confirm against the discovery_pool schema).
    Returns the number of rows removed, or 0 on error.
    """
    try:
        with self._get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("""
                DELETE FROM discovery_pool
                WHERE added_date < datetime('now', '-' || ? || ' days')
            """, (days_threshold,))
            removed = cursor.rowcount
            conn.commit()

        if removed > 0:
            logger.info(f"Cleaned up {removed} discovery tracks older than {days_threshold} days")
        return removed
    except Exception as e:
        logger.error(f"Error cleaning up old discovery tracks: {e}")
        return 0
def add_recent_release(self, watchlist_artist_id: int, album_data: Dict[str, Any]) -> bool:
"""Add a recent release to the recent_releases table"""
try:

@ -14466,62 +14466,39 @@ def get_discover_recent_releases():
@app.route('/api/discover/release-radar', methods=['GET'])
def get_discover_release_radar():
"""Get release radar playlist - 50 tracks randomly selected from all recent albums"""
"""Get release radar playlist - curated selection that stays consistent until next update"""
try:
import random
database = get_database()
if not spotify_client or not spotify_client.is_authenticated():
return jsonify({"success": True, "tracks": []})
# Get all recent albums from cache
recent_albums = database.get_discovery_recent_albums(limit=20)
if not recent_albums:
return jsonify({"success": True, "tracks": []})
all_tracks = []
# Get tracks from each recent album
for album in recent_albums:
try:
# Get album tracks from Spotify
album_data = spotify_client.get_album(album['album_spotify_id'])
if album_data and 'tracks' in album_data:
for track in album_data['tracks']['items']:
all_tracks.append({
"spotify_track_id": track['id'],
"track_name": track['name'],
"artist_name": album['artist_name'],
"album_name": album['album_name'],
"album_cover_url": album['album_cover_url'],
"duration_ms": track.get('duration_ms', 0),
"track_data_json": track
})
except Exception as e:
print(f"Error getting tracks for album {album['album_name']}: {e}")
continue
# Group tracks by artist to ensure variety
tracks_by_artist = {}
for track in all_tracks:
artist_name = track['artist_name']
if artist_name not in tracks_by_artist:
tracks_by_artist[artist_name] = []
tracks_by_artist[artist_name].append(track)
# Limit each artist to max 6 tracks for variety
balanced_tracks = []
for artist_name, tracks in tracks_by_artist.items():
random.shuffle(tracks)
balanced_tracks.extend(tracks[:6]) # Max 6 tracks per artist
# Try to get curated playlist first
curated_track_ids = database.get_curated_playlist('release_radar')
if curated_track_ids:
# Use curated selection - fetch track data from discovery pool
discovery_tracks = database.get_discovery_pool_tracks(limit=5000, new_releases_only=False)
tracks_by_id = {track.spotify_track_id: track for track in discovery_tracks}
selected_tracks = []
for track_id in curated_track_ids:
if track_id in tracks_by_id:
track = tracks_by_id[track_id]
selected_tracks.append({
"spotify_track_id": track.spotify_track_id,
"track_name": track.track_name,
"artist_name": track.artist_name,
"album_name": track.album_name,
"album_cover_url": track.album_cover_url,
"duration_ms": track.duration_ms,
"track_data_json": track.track_data_json
})
# Randomly select up to 50 tracks from balanced pool
random.shuffle(balanced_tracks)
selected_tracks = balanced_tracks[:50]
return jsonify({"success": True, "tracks": selected_tracks})
return jsonify({"success": True, "tracks": selected_tracks})
# Fallback: no curated playlist exists (shouldn't happen after first scan)
return jsonify({"success": True, "tracks": []})
except Exception as e:
print(f"Error getting release radar: {e}")
@ -14531,105 +14508,39 @@ def get_discover_release_radar():
@app.route('/api/discover/weekly', methods=['GET'])
def get_discover_weekly():
"""Get discovery weekly playlist - 50 tracks from similar artists, watchlist artists, and database albums"""
"""Get discovery weekly playlist - curated selection that stays consistent until next update"""
try:
import random
database = get_database()
if not spotify_client or not spotify_client.is_authenticated():
return jsonify({"success": True, "tracks": []})
all_tracks = []
# 1. Get tracks from discovery pool (similar artists) - aim for ~30 tracks
discovery_tracks = database.get_discovery_pool_tracks(limit=300, new_releases_only=False)
for track in discovery_tracks:
all_tracks.append({
"spotify_track_id": track.spotify_track_id,
"track_name": track.track_name,
"artist_name": track.artist_name,
"album_name": track.album_name,
"album_cover_url": track.album_cover_url,
"duration_ms": track.duration_ms,
"track_data_json": track.track_data_json
})
# 2. Get tracks from random watchlist artists - aim for ~10 tracks
try:
watchlist_artists = database.get_watchlist_artists()
if watchlist_artists:
random_watchlist = random.sample(watchlist_artists, min(2, len(watchlist_artists)))
for artist in random_watchlist:
try:
albums = spotify_client.get_artist_albums(artist.spotify_artist_id, album_type='album', limit=10)
if albums:
random_album = random.choice(albums)
album_data = spotify_client.get_album(random_album.id)
if album_data and 'tracks' in album_data:
for track in album_data['tracks']['items'][:5]: # 5 tracks per album
all_tracks.append({
"spotify_track_id": track['id'],
"track_name": track['name'],
"artist_name": artist.artist_name,
"album_name": random_album.name,
"album_cover_url": random_album.image_url if hasattr(random_album, 'image_url') else None,
"duration_ms": track.get('duration_ms', 0),
"track_data_json": track
})
except Exception as e:
continue
except Exception as e:
print(f"Error getting watchlist tracks: {e}")
# 3. Get tracks from random database albums - aim for ~10 tracks
try:
# Get random albums from database
with database._get_connection() as conn:
cursor = conn.cursor()
cursor.execute("""
SELECT DISTINCT a.title, ar.name as artist_name
FROM albums_new a
JOIN artists_new ar ON a.artist_id = ar.id
ORDER BY RANDOM()
LIMIT 2
""")
db_albums = cursor.fetchall()
for album_row in db_albums:
try:
# Search for album on Spotify
query = f"album:{album_row['title']} artist:{album_row['artist_name']}"
search_results = spotify_client.search_albums(query, limit=1)
if search_results and len(search_results) > 0:
spotify_album = search_results[0]
album_data = spotify_client.get_album(spotify_album.id)
if album_data and 'tracks' in album_data:
for track in album_data['tracks']['items'][:5]: # 5 tracks per album
all_tracks.append({
"spotify_track_id": track['id'],
"track_name": track['name'],
"artist_name": album_row['artist_name'],
"album_name": album_row['title'],
"album_cover_url": spotify_album.image_url if hasattr(spotify_album, 'image_url') else None,
"duration_ms": track.get('duration_ms', 0),
"track_data_json": track
})
except Exception as e:
continue
except Exception as e:
print(f"Error getting database album tracks: {e}")
# Try to get curated playlist first
curated_track_ids = database.get_curated_playlist('discovery_weekly')
if curated_track_ids:
# Use curated selection - fetch track data from discovery pool
discovery_tracks = database.get_discovery_pool_tracks(limit=5000, new_releases_only=False)
tracks_by_id = {track.spotify_track_id: track for track in discovery_tracks}
selected_tracks = []
for track_id in curated_track_ids:
if track_id in tracks_by_id:
track = tracks_by_id[track_id]
selected_tracks.append({
"spotify_track_id": track.spotify_track_id,
"track_name": track.track_name,
"artist_name": track.artist_name,
"album_name": track.album_name,
"album_cover_url": track.album_cover_url,
"duration_ms": track.duration_ms,
"track_data_json": track.track_data_json
})
# Randomly select 50 tracks from the combined pool
random.shuffle(all_tracks)
selected_tracks = all_tracks[:50]
return jsonify({"success": True, "tracks": selected_tracks})
return jsonify({"success": True, "tracks": selected_tracks})
# Fallback: no curated playlist exists (shouldn't happen after first scan)
return jsonify({"success": True, "tracks": []})
except Exception as e:
print(f"Error getting discovery weekly: {e}")
import traceback
traceback.print_exc()
return jsonify({"success": False, "error": str(e)}), 500
@app.route('/api/metadata/start', methods=['POST'])
@ -17830,28 +17741,4 @@ if __name__ == '__main__':
# Add a test activity to verify the system is working
add_activity_item("🔧", "Debug Test", "Activity feed system test", "Now")
# Populate discovery pool at startup (background task)
def startup_populate_discovery():
    """Populate discovery pool at startup in background.

    Intended to run once in a daemon thread right after app start.
    Skips with a console message when Spotify is not authenticated;
    all errors are printed (with traceback) and never propagated.
    """
    try:
        print("🎵 Populating discovery pool at startup...")
        # Imported lazily here rather than at module top — presumably to
        # avoid a circular import at load time; confirm against
        # core.watchlist_scanner's own imports.
        from core.watchlist_scanner import get_watchlist_scanner
        if spotify_client and spotify_client.is_authenticated():
            scanner = get_watchlist_scanner(spotify_client)
            scanner.populate_discovery_pool()
            print("✅ Discovery pool populated successfully")
            add_activity_item("🎵", "Discovery Pool", "Discovery data populated successfully", "Now")
        else:
            print("⚠️ Spotify not authenticated - skipping discovery pool population")
    except Exception as e:
        print(f"❌ Error populating discovery pool at startup: {e}")
        import traceback
        traceback.print_exc()
# Run discovery pool population in background thread
import threading
discovery_thread = threading.Thread(target=startup_populate_discovery, daemon=True)
discovery_thread.start()
print("🔧 Discovery pool population started in background...")
app.run(host='0.0.0.0', port=8008, debug=False)

Loading…
Cancel
Save