mirror of https://github.com/Nezreka/SoulSync.git
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1662 lines
83 KiB
1662 lines
83 KiB
#!/usr/bin/env python3
|
|
|
|
"""
|
|
Watchlist Scanner Service - Monitors watched artists for new releases
|
|
"""
|
|
|
|
from typing import List, Dict, Any, Optional
|
|
from datetime import datetime, timezone, timedelta
|
|
from dataclasses import dataclass
|
|
import re
|
|
import time
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
from database.music_database import get_database, WatchlistArtist
|
|
from core.spotify_client import SpotifyClient
|
|
from core.wishlist_service import get_wishlist_service
|
|
from core.matching_engine import MusicMatchingEngine
|
|
from utils.logging_config import get_logger
|
|
|
|
# Module-level logger shared by every function and method in this file,
# obtained from the project's logging helper under the name "watchlist_scanner".
logger = get_logger("watchlist_scanner")
|
|
|
|
# Pauses, in seconds, used to throttle Spotify traffic during watchlist scans.
DELAY_BETWEEN_ARTISTS = 2.0      # wait before moving on to the next artist
DELAY_BETWEEN_ALBUMS = 0.5       # wait between two albums of the same artist
DELAY_BETWEEN_API_BATCHES = 1.0  # wait between batched API operations
|
|
|
|
def clean_track_name_for_search(track_name):
    """
    Clean a track name for searching: drop noise, keep version information.

    Removed (case-insensitive): (Explicit), (Clean), (Radio Edit),
    (Radio Version), (feat. X), (ft. X), (featuring X), (with X), and any
    [...] bracket containing the word "explicit" or "clean".

    Kept on purpose, because they identify a different recording:
    (Extended Version), (Live), (Acoustic), (Remix), (Remastered), (Demo),
    (Instrumental), and album/year info such as (2023) or (Deluxe Edition).

    The credit keywords are matched as whole words, so a subtitle that merely
    starts with the same letters -- "(Without You)", "(Feathers Mix)",
    "[Cleaned Up Mix]" -- is not mistaken for noise.

    Args:
        track_name: The raw track title.

    Returns:
        The cleaned title. The input is returned unchanged when it is empty,
        not a string, or when cleaning would leave nothing.
    """
    if not track_name or not isinstance(track_name, str):
        return track_name

    # Patterns to REMOVE (noise that doesn't affect track identity).
    # \b after each keyword requires the keyword to be a complete word.
    remove_patterns = [
        r'\s*\(explicit\)',                    # (Explicit)
        r'\s*\(clean\)',                       # (Clean)
        r'\s*\(radio\s*edit\)',                # (Radio Edit)
        r'\s*\(radio\s*version\)',             # (Radio Version)
        r'\s*\(feat\b\.?\s*[^)]+\)',           # (feat. Artist) / (feat Artist)
        r'\s*\(ft\b\.?\s*[^)]+\)',             # (ft. Artist) / (ft Artist)
        r'\s*\(featuring\b\s*[^)]+\)',         # (featuring Artist)
        r'\s*\(with\b\s*[^)]+\)',              # (with Artist)
        r'\s*\[[^\]]*\bexplicit\b[^\]]*\]',    # [Explicit] in brackets
        r'\s*\[[^\]]*\bclean\b[^\]]*\]',       # [Clean] in brackets
    ]

    # Apply the patterns one after another; stripping in between keeps a
    # removal in the middle of the title from leaving stray edge whitespace.
    cleaned_name = track_name
    for pattern in remove_patterns:
        cleaned_name = re.sub(pattern, '', cleaned_name, flags=re.IGNORECASE).strip()

    # A title made only of noise (e.g. "(Explicit)") must not become empty.
    if not cleaned_name.strip():
        return track_name

    if cleaned_name != track_name:
        logger.debug(f"🧹 Intelligent track cleaning: '{track_name}' -> '{cleaned_name}'")

    return cleaned_name
|
|
|
|
@dataclass
class ScanResult:
    """Outcome of one artist scan, as produced by WatchlistScanner.scan_artist."""

    # Display name of the scanned watchlist artist.
    artist_name: str
    # Spotify ID of the scanned artist.
    spotify_artist_id: str
    # Number of releases examined (0 when the scan failed).
    albums_checked: int
    # Tracks that were reported as missing from the library.
    new_tracks_found: int
    # Missing tracks that were successfully stored in the wishlist.
    tracks_added_to_wishlist: int
    # True when the scan ran to completion.
    success: bool
    # Failure description; stays None for successful scans.
    error_message: Optional[str] = None
|
|
|
|
class WatchlistScanner:
|
|
"""Service for scanning watched artists for new releases"""
|
|
|
|
def __init__(self, spotify_client: SpotifyClient, database_path: str = "database/music_library.db"):
    """
    Store the collaborators; nothing is opened or connected here.

    Args:
        spotify_client: Client used for every Spotify lookup.
        database_path: Path handed to get_database() on first database access.
    """
    self.spotify_client, self.database_path = spotify_client, database_path

    # Created on first use by the matching properties (lazy loading).
    self._database = self._wishlist_service = self._matching_engine = None
|
|
|
|
@property
def database(self):
    """Return the database handle, creating it from database_path on first access."""
    handle = self._database
    if handle is None:
        handle = self._database = get_database(self.database_path)
    return handle
|
|
|
|
@property
def wishlist_service(self):
    """Return the wishlist service, fetching it via get_wishlist_service() on first access."""
    service = self._wishlist_service
    if service is None:
        service = self._wishlist_service = get_wishlist_service()
    return service
|
|
|
|
@property
def matching_engine(self):
    """Return the MusicMatchingEngine, instantiating it on first access."""
    engine = self._matching_engine
    if engine is None:
        engine = self._matching_engine = MusicMatchingEngine()
    return engine
|
|
|
|
def scan_all_watchlist_artists(self) -> List[ScanResult]:
    """
    Scan a selection of watchlist artists for new releases.

    Selection for one run:
    - Every artist that was never scanned, or whose last scan is more than
      7 days old, is always scanned (even when that exceeds 50 artists).
    - If fewer than 50 artists qualify, the remaining slots are filled with
      a random sample of the recently scanned artists.

    Artists are scanned in random order with DELAY_BETWEEN_ARTISTS seconds
    between them, whether or not the previous scan succeeded. Afterwards the
    discovery pool and the seasonal content are refreshed; a failure in
    either step is logged and does not discard the scan results.

    Returns:
        One ScanResult per scanned artist. An empty list when the watchlist
        is empty or the run could not be set up (the error is logged).
    """
    import random

    logger.info("Starting watchlist scan")

    try:
        all_watchlist_artists = self.database.get_watchlist_artists()
        if not all_watchlist_artists:
            logger.info("No artists in watchlist to scan")
            return []

        logger.info(f"Found {len(all_watchlist_artists)} total artists in watchlist")

        # update_artist_scan_timestamp() stores SQLite CURRENT_TIMESTAMP, which
        # is UTC, so the cutoff is computed in UTC as well. Naive timestamps
        # are treated as UTC, matching is_album_after_timestamp().
        seven_days_ago = datetime.now(timezone.utc) - timedelta(days=7)
        must_scan = []
        can_skip = []

        for artist in all_watchlist_artists:
            last_scan = artist.last_scan_timestamp
            if last_scan is not None and last_scan.tzinfo is None:
                last_scan = last_scan.replace(tzinfo=timezone.utc)

            if last_scan is None or last_scan < seven_days_ago:
                # Never scanned, or not scanned in 7+ days - must scan
                must_scan.append(artist)
            else:
                # Scanned recently - only scanned if randomly selected below
                can_skip.append(artist)

        logger.info(f"Artists requiring scan (not scanned in 7+ days): {len(must_scan)}")
        logger.info(f"Artists scanned recently (< 7 days): {len(can_skip)}")

        # Fill the remaining slots (up to 50 total) with a random selection
        max_artists_per_scan = 50
        artists_to_scan = list(must_scan)

        remaining_slots = max_artists_per_scan - len(must_scan)
        if remaining_slots > 0 and can_skip:
            random_selection = random.sample(can_skip, min(remaining_slots, len(can_skip)))
            artists_to_scan.extend(random_selection)
            logger.info(f"Additionally scanning {len(random_selection)} randomly selected artists")

        # Shuffle to avoid always scanning in the same order
        random.shuffle(artists_to_scan)

        logger.info(f"Total artists to scan this run: {len(artists_to_scan)}")
        skipped_count = len(all_watchlist_artists) - len(artists_to_scan)
        if skipped_count > 0:
            logger.info(f"Skipping {skipped_count} artists (will be scanned in future runs)")

        scan_results = []
        last_index = len(artists_to_scan) - 1

        for i, artist in enumerate(artists_to_scan):
            try:
                result = self.scan_artist(artist)
            except Exception as e:
                logger.error(f"Error scanning artist {artist.artist_name}: {e}")
                result = ScanResult(
                    artist_name=artist.artist_name,
                    spotify_artist_id=artist.spotify_artist_id,
                    albums_checked=0,
                    new_tracks_found=0,
                    tracks_added_to_wishlist=0,
                    success=False,
                    error_message=str(e)
                )
            else:
                if result.success:
                    logger.info(f"✅ Scanned {artist.artist_name}: {result.new_tracks_found} new tracks found")
                else:
                    logger.warning(f"❌ Failed to scan {artist.artist_name}: {result.error_message}")

            scan_results.append(result)

            # Rate limiting between artists to stay below Spotify API limits.
            # Applied after failures too: an error is no reason to hit the
            # API faster. No delay after the last artist.
            if i < last_index:
                logger.debug(f"Rate limiting: waiting {DELAY_BETWEEN_ARTISTS}s before scanning next artist")
                time.sleep(DELAY_BETWEEN_ARTISTS)

        # Log summary
        successful_scans = [r for r in scan_results if r.success]
        total_new_tracks = sum(r.new_tracks_found for r in successful_scans)
        total_added_to_wishlist = sum(r.tracks_added_to_wishlist for r in successful_scans)

        logger.info(f"Watchlist scan complete: {len(successful_scans)}/{len(scan_results)} artists scanned successfully")
        logger.info(f"Found {total_new_tracks} new tracks, added {total_added_to_wishlist} to wishlist")

        # Post-processing is best-effort: the scan itself already finished,
        # so a failure here must not throw away scan_results.
        try:
            logger.info("Starting discovery pool population...")
            self.populate_discovery_pool()
        except Exception as e:
            logger.error(f"Error populating discovery pool: {e}")

        try:
            logger.info("Updating seasonal content...")
            self._populate_seasonal_content()
        except Exception as e:
            logger.error(f"Error updating seasonal content: {e}")

        return scan_results

    except Exception as e:
        logger.error(f"Error during watchlist scan: {e}")
        return []
|
|
|
|
def scan_artist(self, watchlist_artist: WatchlistArtist) -> ScanResult:
    """
    Scan a single watchlist artist for new releases.

    Steps:
    1. Refresh the stored artist image from Spotify (best-effort).
    2. Fetch the discography, filtered by the artist's last scan timestamp.
    3. For at most 50 releases, add every track that is missing from the
       library to the wishlist, honouring the album/EP/single preferences.
    4. Update the artist's last scan timestamp.
    5. Refresh the similar artists unless a copy younger than 30 days is
       cached (best-effort).

    Args:
        watchlist_artist: The artist to scan.

    Returns:
        A ScanResult. success is False when the discography could not be
        fetched or an unexpected error occurred; errors on individual albums
        are logged and do not fail the scan.
    """
    MAX_ALBUMS_PER_ARTIST = 50  # Reasonable limit to prevent API abuse

    def failed(message: str) -> ScanResult:
        # Uniform result for every failure path.
        return ScanResult(
            artist_name=watchlist_artist.artist_name,
            spotify_artist_id=watchlist_artist.spotify_artist_id,
            albums_checked=0,
            new_tracks_found=0,
            tracks_added_to_wishlist=0,
            success=False,
            error_message=message
        )

    try:
        logger.info(f"Scanning artist: {watchlist_artist.artist_name}")

        # Update artist image from Spotify; failures only produce a warning.
        try:
            artist_data = self.spotify_client.get_artist(watchlist_artist.spotify_artist_id)
            if artist_data and 'images' in artist_data and artist_data['images']:
                images = artist_data['images']
                # Second entry when available (the original code treats it as
                # the medium-sized image), otherwise the only entry.
                image_url = images[1]['url'] if len(images) > 1 else images[0]['url']

                if image_url:
                    self.database.update_watchlist_artist_image(watchlist_artist.spotify_artist_id, image_url)
                    logger.info(f"Updated artist image for {watchlist_artist.artist_name}")
                else:
                    logger.warning(f"No image URL found for {watchlist_artist.artist_name}")
            else:
                logger.warning(f"No images in Spotify data for {watchlist_artist.artist_name}")
        except Exception as img_error:
            logger.warning(f"Could not update artist image for {watchlist_artist.artist_name}: {img_error}")

        # Get artist discography from Spotify (None signals a lookup failure)
        albums = self.get_artist_discography(watchlist_artist.spotify_artist_id, watchlist_artist.last_scan_timestamp)
        if albums is None:
            return failed("Failed to get artist discography from Spotify")

        logger.info(f"Found {len(albums)} albums/singles to check for {watchlist_artist.artist_name}")

        # Safety check: limit the number of albums to prevent extremely long sessions
        if len(albums) > MAX_ALBUMS_PER_ARTIST:
            logger.warning(f"Artist {watchlist_artist.artist_name} has {len(albums)} albums, limiting to {MAX_ALBUMS_PER_ARTIST} most recent")
            # Assumes the discography is ordered most recent first -- TODO confirm
            albums = albums[:MAX_ALBUMS_PER_ARTIST]

        new_tracks_found = 0
        tracks_added_to_wishlist = 0
        album_count = len(albums)

        for album_index, album in enumerate(albums):
            try:
                logger.info(f"Checking album {album_index + 1}/{album_count}: {album.name}")
                album_data = self.spotify_client.get_album(album.id)
                if not album_data or 'tracks' not in album_data or not album_data['tracks'].get('items'):
                    continue

                tracks = album_data['tracks']['items']
                logger.debug(f"Checking album: {album_data.get('name', 'Unknown')} ({len(tracks)} tracks)")

                # Check if user wants this type of release
                if not self._should_include_release(len(tracks), watchlist_artist):
                    release_type = "album" if len(tracks) >= 7 else ("EP" if len(tracks) >= 4 else "single")
                    logger.debug(f"Skipping {release_type}: {album_data.get('name', 'Unknown')} - user preference")
                    continue

                for track in tracks:
                    if self.is_track_missing_from_library(track):
                        new_tracks_found += 1
                        if self.add_track_to_wishlist(track, album_data, watchlist_artist):
                            tracks_added_to_wishlist += 1

            except Exception as e:
                logger.warning(f"Error checking album {album.name}: {e}")

            finally:
                # Rate limiting between albums. In `finally` so the pause also
                # happens when the album was skipped or raised: the Spotify
                # request has already been made on those paths.
                # No delay after the last album.
                if album_index < album_count - 1:
                    logger.debug(f"Rate limiting: waiting {DELAY_BETWEEN_ALBUMS}s before next album")
                    time.sleep(DELAY_BETWEEN_ALBUMS)

        # NOTE(review): the timestamp advances even when some albums failed
        # above, so those releases may be filtered out of the next scan.
        self.update_artist_scan_timestamp(watchlist_artist.spotify_artist_id)

        # Fetch and store similar artists for the discovery feature, unless a
        # copy younger than 30 days is cached (avoids over-polling).
        try:
            if self.database.has_fresh_similar_artists(watchlist_artist.spotify_artist_id, days_threshold=30):
                logger.info(f"Similar artists for {watchlist_artist.artist_name} are cached and fresh, skipping fetch")
            else:
                logger.info(f"Fetching similar artists for {watchlist_artist.artist_name}...")
                self.update_similar_artists(watchlist_artist)
                logger.info(f"Similar artists updated for {watchlist_artist.artist_name}")
        except Exception as similar_error:
            logger.warning(f"Failed to update similar artists for {watchlist_artist.artist_name}: {similar_error}")

        return ScanResult(
            artist_name=watchlist_artist.artist_name,
            spotify_artist_id=watchlist_artist.spotify_artist_id,
            albums_checked=len(albums),
            new_tracks_found=new_tracks_found,
            tracks_added_to_wishlist=tracks_added_to_wishlist,
            success=True
        )

    except Exception as e:
        logger.error(f"Error scanning artist {watchlist_artist.artist_name}: {e}")
        return failed(str(e))
|
|
|
|
def get_artist_discography(self, spotify_artist_id: str, last_scan_timestamp: Optional[datetime] = None) -> Optional[List]:
    """
    Get an artist's albums and singles from Spotify, optionally filtered by release date.

    Args:
        spotify_artist_id: Spotify artist ID.
        last_scan_timestamp: Cutoff for incremental scans; only releases
            accepted by is_album_after_timestamp() for this cutoff are
            returned. If None, the cutoff is derived from the lookback
            period setting. The setting 'all' disables filtering, and a
            setting that is not a number falls back to 30 days.

    Returns:
        The (possibly empty) list of album objects, or None when the lookup
        failed; the error is logged.
    """
    DEFAULT_LOOKBACK_DAYS = 30  # same default as _get_lookback_period_setting()

    try:
        logger.debug(f"Fetching discography for artist {spotify_artist_id}")
        albums = self.spotify_client.get_artist_albums(spotify_artist_id, album_type='album,single', limit=50)

        if not albums:
            logger.warning(f"No albums found for artist {spotify_artist_id}")
            return []

        # Small extra pause after the discography request (300ms breathing room)
        time.sleep(0.3)

        cutoff_timestamp = last_scan_timestamp

        # First scan of this artist: derive the cutoff from the lookback setting
        if cutoff_timestamp is None:
            lookback_period = self._get_lookback_period_setting()
            if lookback_period != 'all':
                try:
                    days = int(lookback_period)
                except (TypeError, ValueError):
                    # A corrupt setting must not make every first scan fail.
                    logger.warning(f"Invalid lookback period {lookback_period!r}, defaulting to {DEFAULT_LOOKBACK_DAYS} days")
                    days = DEFAULT_LOOKBACK_DAYS

                cutoff_timestamp = datetime.now(timezone.utc) - timedelta(days=days)
                logger.info(f"Using lookback period: {days} days (cutoff: {cutoff_timestamp})")

        # No cutoff (lookback_period = 'all'): return everything
        if cutoff_timestamp is None:
            return albums

        filtered_albums = [
            album for album in albums
            if self.is_album_after_timestamp(album, cutoff_timestamp)
        ]
        logger.info(f"Filtered {len(albums)} albums to {len(filtered_albums)} released after {cutoff_timestamp}")
        return filtered_albums

    except Exception as e:
        logger.error(f"Error getting discography for artist {spotify_artist_id}: {e}")
        return None
|
|
|
|
def _get_lookback_period_setting(self) -> str:
    """
    Read the 'discovery_lookback_period' value from the metadata table.

    Returns:
        str: The stored period ('7', '30', '90', '180', or 'all' are the
        expected values). '30' when no row exists or the lookup fails.
    """
    fallback = '30'  # default lookback: 30 days

    try:
        with self.database._get_connection() as conn:
            cur = conn.cursor()
            cur.execute("SELECT value FROM metadata WHERE key = 'discovery_lookback_period'")
            setting_row = cur.fetchone()

            if not setting_row:
                return fallback
            return setting_row['value']

    except Exception as e:
        logger.warning(f"Error getting lookback period setting, defaulting to 30 days: {e}")
        return fallback
|
|
|
|
def is_album_after_timestamp(self, album, timestamp: datetime) -> bool:
    """
    Check whether an album may have been released after the given timestamp.

    Release dates come as "YYYY", "YYYY-MM" or "YYYY-MM-DD", while the
    timestamp is precise to the second. The album is therefore accepted when
    the END of its release period (year, month or day, in UTC) lies after the
    timestamp. Comparing the start of the period instead would permanently
    drop a release dated on the same day as, but published after, the scan.

    Args:
        album: Object with a release_date string attribute.
        timestamp: Cutoff; a naive value is interpreted as UTC.

    Returns:
        True when the album should be included. Unknown, unparsable or
        unexpected release dates are included to be safe.
    """
    try:
        release_date_str = album.release_date
        if not release_date_str:
            return True  # Include albums with unknown release dates to be safe

        # period_end is the first instant AFTER the release period.
        if len(release_date_str) == 4:  # Year only (e.g., "2023")
            period_end = datetime(int(release_date_str) + 1, 1, 1, tzinfo=timezone.utc)
        elif len(release_date_str) == 7:  # Year-month (e.g., "2023-10")
            year, month = release_date_str.split('-')
            month_start = datetime(int(year), int(month), 1, tzinfo=timezone.utc)
            # 32 days after the 1st is always in the next month.
            period_end = (month_start + timedelta(days=32)).replace(day=1)
        elif len(release_date_str) == 10:  # Full date (e.g., "2023-10-15")
            day_start = datetime.strptime(release_date_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
            period_end = day_start + timedelta(days=1)
        else:
            logger.warning(f"Unknown release date format: {release_date_str}")
            return True  # Include if we can't parse

        # Ensure timestamp has timezone info before comparing
        if timestamp.tzinfo is None:
            timestamp = timestamp.replace(tzinfo=timezone.utc)

        return period_end > timestamp

    except Exception as e:
        logger.warning(f"Error comparing album date {getattr(album, 'release_date', None)} with timestamp {timestamp}: {e}")
        return True  # Include if we can't determine
|
|
|
|
def _should_include_release(self, track_count: int, watchlist_artist: WatchlistArtist) -> bool:
    """
    Decide from the track count whether the user wants this kind of release.

    The track count determines the category:
    - 7 or more tracks: album  -> include_albums
    - 4 to 6 tracks:    EP     -> include_eps
    - fewer than 4:     single -> include_singles

    Args:
        track_count: Number of tracks in the release.
        watchlist_artist: Artist record carrying the include_* preferences.

    Returns:
        The preference for the matching category. A preference attribute
        that is absent counts as True (backwards compatibility), and any
        error while deciding also yields True.
    """
    try:
        wanted = {
            flag: getattr(watchlist_artist, flag, True)
            for flag in ('include_albums', 'include_eps', 'include_singles')
        }

        if track_count >= 7:
            return wanted['include_albums']
        if track_count >= 4:
            return wanted['include_eps']
        return wanted['include_singles']

    except Exception as e:
        logger.warning(f"Error checking release inclusion: {e}")
        return True  # Default to including on error
|
|
|
|
def is_track_missing_from_library(self, track) -> bool:
    """
    Check if a track is missing from the library of the active media server.

    Every track artist is combined with every title variation (the original
    title, the title cleaned by clean_track_name_for_search(), and the
    matching engine's clean_title()). The track counts as present as soon as
    one combination is found in the database with a confidence of at least 0.7.

    Args:
        track: Spotify track as a dict, or an object with name/artists attributes.

    Returns:
        False when a match was found; True when no match was found or the
        check itself failed (the track is then assumed to be missing).
    """
    CONFIDENCE_THRESHOLD = 0.7

    try:
        # Handle both dict and object track formats
        if isinstance(track, dict):
            original_title = track.get('name', 'Unknown')
            track_artists = track.get('artists', [])
            artists_to_search = [artist.get('name', 'Unknown') for artist in track_artists] if track_artists else ["Unknown"]
        else:
            original_title = track.name
            artists_to_search = [artist.name for artist in track.artists] if track.artists else ["Unknown"]

        # Title variations, original first
        title_variations = [original_title]

        # Only add the cleaned version if it actually removed noise
        cleaned_for_search = clean_track_name_for_search(original_title)
        if cleaned_for_search.lower() != original_title.lower():
            title_variations.append(cleaned_for_search)

        # Add the matching engine's conservative clean_title if it is new
        base_title = self.matching_engine.clean_title(original_title)
        if base_title.lower() not in [t.lower() for t in title_variations]:
            title_variations.append(base_title)

        unique_title_variations = list(dict.fromkeys(title_variations))

        # The active server is the same for every lookup below, so resolve it
        # once instead of once per artist/title combination.
        from config.settings import config_manager
        active_server = config_manager.get_active_media_server()

        for artist_name in artists_to_search:
            for query_title in unique_title_variations:
                db_track, confidence = self.database.check_track_exists(
                    query_title,
                    artist_name,
                    confidence_threshold=CONFIDENCE_THRESHOLD,
                    server_source=active_server
                )

                if db_track and confidence >= CONFIDENCE_THRESHOLD:
                    logger.debug(f"✔️ Track found in library: '{original_title}' by '{artist_name}' (confidence: {confidence:.2f})")
                    return False  # Track exists in library

        # No match found with any variation or artist
        logger.info(f"❌ Track missing from library: '{original_title}' by '{artists_to_search[0] if artists_to_search else 'Unknown'}' - adding to wishlist")
        return True  # Track is missing

    except Exception as e:
        # Handle both dict and object track formats for error logging
        track_name = track.get('name', 'Unknown') if isinstance(track, dict) else getattr(track, 'name', 'Unknown')
        logger.warning(f"Error checking if track exists: {track_name}: {e}")
        return True  # Assume missing if we can't check
|
|
|
|
def add_track_to_wishlist(self, track, album, watchlist_artist: WatchlistArtist) -> bool:
    """
    Add a missing track to the wishlist.

    The track and album are normalised into one Spotify-style dict and
    stored through database.add_to_wishlist() with source_type "watchlist".

    Args:
        track: Spotify track as a dict, or an object with id/name/artists attributes.
        album: Spotify album as a dict, or an object with name/id/release_date attributes.
        watchlist_artist: The watched artist whose scan found the track.

    Returns:
        The result of database.add_to_wishlist(), or False when an error
        occurred (the error is logged).
    """
    # Bound before the try block so the except handler can always log it,
    # even when the failure happens before the real name has been read.
    track_name = 'Unknown'

    try:
        # Handle both dict and object track formats
        if isinstance(track, dict):
            track_id = track.get('id', '')
            track_name = track.get('name', 'Unknown')
            track_artists = track.get('artists', [])
            track_duration = track.get('duration_ms', 0)
            track_explicit = track.get('explicit', False)
            track_external_urls = track.get('external_urls', {})
            track_popularity = track.get('popularity', 0)
            track_preview_url = track.get('preview_url', None)
            track_number = track.get('track_number', 1)
            track_uri = track.get('uri', '')
        else:
            track_id = track.id
            track_name = track.name
            track_artists = [{'name': artist.name, 'id': artist.id} for artist in track.artists]
            track_duration = getattr(track, 'duration_ms', 0)
            track_explicit = getattr(track, 'explicit', False)
            track_external_urls = getattr(track, 'external_urls', {})
            track_popularity = getattr(track, 'popularity', 0)
            track_preview_url = getattr(track, 'preview_url', None)
            track_number = getattr(track, 'track_number', 1)
            track_uri = getattr(track, 'uri', '')

        # Handle both dict and object album formats
        if isinstance(album, dict):
            album_name = album.get('name', 'Unknown')
            album_id = album.get('id', '')
            album_release_date = album.get('release_date', '')
            album_images = album.get('images', [])
            album_type = album.get('album_type', 'album')  # 'album', 'single', or 'ep'
            total_tracks = album.get('total_tracks', 0)
        else:
            album_name = album.name
            album_id = album.id
            album_release_date = album.release_date
            album_images = getattr(album, 'images', [])
            album_type = getattr(album, 'album_type', 'album')
            total_tracks = getattr(album, 'total_tracks', 0)

        # Create Spotify track data structure
        spotify_track_data = {
            'id': track_id,
            'name': track_name,
            'artists': track_artists,
            'album': {
                'name': album_name,
                'id': album_id,
                'release_date': album_release_date,
                'images': album_images,
                'album_type': album_type,      # Stored for category filtering
                'total_tracks': total_tracks,  # Stored for accurate categorization
            },
            'duration_ms': track_duration,
            'explicit': track_explicit,
            'external_urls': track_external_urls,
            'popularity': track_popularity,
            'preview_url': track_preview_url,
            'track_number': track_number,
            'uri': track_uri,
            'is_local': False,
        }

        # Add to wishlist with watchlist context
        success = self.database.add_to_wishlist(
            spotify_track_data=spotify_track_data,
            failure_reason="Missing from library (found by watchlist scan)",
            source_type="watchlist",
            source_info={
                'watchlist_artist_name': watchlist_artist.artist_name,
                'watchlist_artist_id': watchlist_artist.spotify_artist_id,
                'album_name': album_name,
                'scan_timestamp': datetime.now().isoformat()
            }
        )

        if success:
            first_artist = track_artists[0].get('name', 'Unknown') if track_artists else 'Unknown'
            logger.debug(f"Added track to wishlist: {track_name} by {first_artist}")
        else:
            logger.warning(f"Failed to add track to wishlist: {track_name}")

        return success

    except Exception as e:
        logger.error(f"Error adding track to wishlist: {track_name}: {e}")
        return False
|
|
|
|
def update_artist_scan_timestamp(self, spotify_artist_id: str) -> bool:
    """
    Set last_scan_timestamp and updated_at of a watchlist artist to the current time.

    Args:
        spotify_artist_id: Spotify ID of the watchlist artist to stamp.

    Returns:
        True when a row was updated; False when no artist has that ID or
        the update raised (the problem is logged).
    """
    try:
        with self.database._get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("""
                UPDATE watchlist_artists
                SET last_scan_timestamp = CURRENT_TIMESTAMP, updated_at = CURRENT_TIMESTAMP
                WHERE spotify_artist_id = ?
            """, (spotify_artist_id,))
            conn.commit()

            if cursor.rowcount <= 0:
                logger.warning(f"No artist found with Spotify ID {spotify_artist_id}")
                return False

            logger.debug(f"Updated scan timestamp for artist {spotify_artist_id}")
            return True

    except Exception as e:
        logger.error(f"Error updating scan timestamp for artist {spotify_artist_id}: {e}")
        return False
|
|
|
|
def _fetch_similar_artists_from_musicmap(self, artist_name: str, limit: int = 20) -> List[Dict[str, Any]]:
    """
    Fetch similar artists from MusicMap and match them to Spotify.

    The MusicMap page for the artist is downloaded, the link texts inside
    the element with id 'gnodMap' are read as artist names, and the first
    `limit` names are looked up on Spotify (top search hit each).

    Args:
        artist_name: The artist name to find similar artists for.
        limit: Maximum number of MusicMap names to look up on Spotify
            (default: 20). Fewer artists are returned when names cannot be
            matched, are duplicates, or resolve to the searched artist.

    Returns:
        List of dicts with the keys 'id', 'name', 'image_url', 'genres' and
        'popularity'. Empty when the page cannot be fetched or parsed; such
        errors are logged.
    """
    # Function-scope import, like the other local imports in this file.
    from urllib.parse import quote_plus

    try:
        logger.info(f"Fetching similar artists from MusicMap for: {artist_name}")

        # Percent-encode the name for the URL path: spaces become '+', and
        # characters such as '/', '?', '#', '&' no longer break the URL
        # (e.g. "AC/DC").
        url_artist = quote_plus(artist_name.lower())
        musicmap_url = f'https://www.music-map.com/{url_artist}'

        # Set headers to mimic a browser
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
        }

        response = requests.get(musicmap_url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        gnod_map = soup.find(id='gnodMap')

        if not gnod_map:
            logger.warning(f"Could not find artist map on MusicMap for {artist_name}")
            return []

        # Extract similar artist names, dropping empty link texts (they
        # would only waste a Spotify search) and the searched artist itself.
        searched_artist_lower = artist_name.lower().strip()
        similar_artist_names = []
        for anchor in gnod_map.find_all('a'):
            artist_text = anchor.get_text(strip=True)
            if not artist_text or artist_text.lower() == searched_artist_lower:
                continue
            similar_artist_names.append(artist_text)

        logger.info(f"Found {len(similar_artist_names)} similar artists from MusicMap")

        # Get the searched artist's Spotify ID so it can be excluded by ID too
        searched_artist_id = None
        try:
            searched_results = self.spotify_client.search_artists(artist_name, limit=1)
            if searched_results:
                searched_artist_id = searched_results[0].id
        except Exception as e:
            logger.warning(f"Could not get searched artist ID: {e}")

        # Match each artist to Spotify
        matched_artists = []
        seen_artist_ids = set()  # Track seen artist IDs to prevent duplicates

        for artist_name_to_match in similar_artist_names[:limit]:
            try:
                results = self.spotify_client.search_artists(artist_name_to_match, limit=1)
                if not results:
                    continue

                spotify_artist = results[0]

                # Skip the searched artist and anything already collected
                if spotify_artist.id == searched_artist_id or spotify_artist.id in seen_artist_ids:
                    continue

                seen_artist_ids.add(spotify_artist.id)
                matched_artists.append({
                    'id': spotify_artist.id,
                    'name': spotify_artist.name,
                    'image_url': getattr(spotify_artist, 'image_url', None),
                    'genres': getattr(spotify_artist, 'genres', []),
                    'popularity': getattr(spotify_artist, 'popularity', 0)
                })

                logger.debug(f" Matched: {spotify_artist.name}")

            except Exception as match_error:
                # One failed lookup must not abort the remaining matches.
                logger.debug(f"Error matching {artist_name_to_match}: {match_error}")
                continue

        logger.info(f"Matched {len(matched_artists)} similar artists to Spotify")
        return matched_artists

    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching from MusicMap: {e}")
        return []
    except Exception as e:
        logger.error(f"Error fetching similar artists from MusicMap: {e}")
        return []
|
|
|
|
def update_similar_artists(self, watchlist_artist: WatchlistArtist, limit: int = 10) -> bool:
    """
    Look up similar artists for a watchlist artist and persist them.

    The names come from MusicMap (already matched to Spotify by
    _fetch_similar_artists_from_musicmap) and each one is stored with its
    1-based position in that list as the similarity rank.

    Args:
        watchlist_artist: The artist whose similar artists are refreshed.
        limit: Passed through to the MusicMap lookup.

    Returns:
        True when the refresh ran, including when nothing was found or
        individual rows failed to store; False on an unexpected error.
    """
    try:
        logger.info(f"Fetching similar artists for {watchlist_artist.artist_name}")

        # List of dicts carrying at least 'id' and 'name'
        candidates = self._fetch_similar_artists_from_musicmap(watchlist_artist.artist_name, limit=limit)

        if not candidates:
            logger.debug(f"No similar artists found for {watchlist_artist.artist_name}")
            return True  # Not an error, just no recommendations

        logger.info(f"Found {len(candidates)} similar artists for {watchlist_artist.artist_name}")

        stored_count = 0
        for position, candidate in enumerate(candidates, start=1):
            try:
                was_stored = self.database.add_or_update_similar_artist(
                    source_artist_id=watchlist_artist.spotify_artist_id,
                    similar_artist_spotify_id=candidate['id'],
                    similar_artist_name=candidate['name'],
                    similarity_rank=position
                )
            except Exception as e:
                # A single bad row is logged and skipped.
                logger.warning(f"Error storing similar artist {candidate.get('name', 'Unknown')}: {e}")
                continue

            if was_stored:
                stored_count += 1
                logger.debug(f" #{position}: {candidate['name']} (Spotify ID: {candidate['id']})")

        logger.info(f"Stored {stored_count}/{len(candidates)} similar artists for {watchlist_artist.artist_name}")
        return True

    except Exception as e:
        logger.error(f"Error fetching similar artists for {watchlist_artist.artist_name}: {e}")
        return False
|
|
|
|
def populate_discovery_pool(self, top_artists_limit: int = 50, albums_per_artist: int = 10):
    """
    Populate discovery pool with tracks from top similar artists.

    Called after watchlist scan completes.

    Steps, in order:
    - Skip entirely when the database reports the pool was populated less than
      24 hours ago (prevents over-polling Spotify).
    - For each top similar artist, select up to ``albums_per_artist`` releases
      (albums, singles and EPs) and add their tracks to the pool.
    - Add tracks from 5 randomly chosen library albums for extra variety.
    - Remove pool tracks older than 365 days, then record the pool size and
      the population timestamp.
    - Refresh the recent-albums cache and the curated playlists.

    Tracks are appended to the existing pool; nothing is cleared up front.

    Args:
        top_artists_limit: Maximum number of similar artists to process.
        albums_per_artist: Maximum number of releases pulled per similar artist.

    Returns:
        None. Errors are logged instead of being raised to the caller.
    """
    try:
        # Check if we should run (prevents over-polling Spotify)
        if not self.database.should_populate_discovery_pool(hours_threshold=24):
            logger.info("Discovery pool was populated recently (< 24 hours ago). Skipping to avoid over-polling Spotify.")
            return

        logger.info("Populating discovery pool from similar artists...")

        # Get top similar artists across all watchlist (ordered by occurrence_count)
        similar_artists = self.database.get_top_similar_artists(limit=top_artists_limit)
        if not similar_artists:
            logger.info("No similar artists found to populate discovery pool")
            return

        logger.info(f"Processing {len(similar_artists)} top similar artists for discovery pool")

        total_tracks_added = 0

        for artist_idx, similar_artist in enumerate(similar_artists, 1):
            # Delay between artists. Sleeping at the top of the iteration keeps the
            # rate limit in force even when the previous artist had no albums or failed.
            if artist_idx > 1:
                time.sleep(DELAY_BETWEEN_ARTISTS)

            try:
                logger.info(f"[{artist_idx}/{len(similar_artists)}] Processing {similar_artist.similar_artist_name} (occurrence: {similar_artist.occurrence_count})")
                total_tracks_added += self._pool_add_similar_artist_tracks(similar_artist, albums_per_artist)
            except Exception as artist_error:
                logger.warning(f"Error processing artist {similar_artist.similar_artist_name}: {artist_error}")
                continue

        logger.info(f"Discovery pool from similar artists complete: {total_tracks_added} tracks added")

        # Note: Watchlist artist albums are already in discovery pool from the watchlist scan itself
        # No need to re-fetch them here to avoid duplicate API calls

        # Add tracks from random database albums for extra variety (reduced to 5 to save API calls)
        logger.info("Adding tracks from database albums to discovery pool...")
        try:
            total_tracks_added += self._pool_add_library_album_tracks()
        except Exception as db_error:
            logger.warning(f"Error processing database albums: {db_error}")

        logger.info(f"Discovery pool population complete: {total_tracks_added} total tracks added from all sources")

        # Clean up tracks older than 365 days (maintain 1 year rolling window)
        logger.info("Cleaning up discovery tracks older than 365 days...")
        deleted_count = self.database.cleanup_old_discovery_tracks(days_threshold=365)
        logger.info(f"Cleaned up {deleted_count} old tracks from discovery pool")

        # Get final track count for metadata
        with self.database._get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("SELECT COUNT(*) as count FROM discovery_pool")
            final_count = cursor.fetchone()['count']

        # Update timestamp to mark when pool was last populated
        self.database.update_discovery_pool_timestamp(track_count=final_count)
        logger.info(f"Discovery pool now contains {final_count} total tracks (built over time)")

        # Cache recent albums for discovery page
        logger.info("Caching recent albums for discovery page...")
        self.cache_discovery_recent_albums()

        # Curate playlists for consistent daily experience
        logger.info("Curating discovery playlists...")
        self.curate_discovery_playlists()

    except Exception as e:
        logger.error(f"Error populating discovery pool: {e}")
        import traceback
        traceback.print_exc()


def _pool_fetch_artist_genres(self, artist_id, label):
    """Return the genre list Spotify reports for an artist, or [] when it cannot be fetched."""
    try:
        artist_data = self.spotify_client.get_artist(artist_id)
        if artist_data and 'genres' in artist_data:
            return artist_data['genres']
    except Exception as e:
        # Genres are optional metadata; a failed lookup must not block the tracks
        logger.debug(f"Could not fetch genres for {label}: {e}")
    return []


def _pool_release_is_new(self, release_date_str, max_age_days=30):
    """Return True when a full YYYY-MM-DD release date is at most ``max_age_days`` old."""
    try:
        # Only full-precision dates are considered; other values count as "not new"
        if not release_date_str or len(release_date_str) != 10:
            return False
        release_date = datetime.strptime(release_date_str, "%Y-%m-%d")
    except (TypeError, ValueError):
        return False
    return (datetime.now() - release_date).days <= max_age_days


def _pool_add_album_tracks(self, album_data, tracks, spotify_artist_id, artist_name, album_name, artist_genres):
    """Add every track of one Spotify album payload to the discovery pool; return how many were added."""
    release_date = album_data.get('release_date', '')
    is_new = self._pool_release_is_new(release_date)

    images = album_data.get('images')
    cover_url = images[0].get('url') if images else None

    # Album metadata embedded in each stored track (including album_type)
    album_summary = {
        'id': album_data['id'],
        'name': album_name,
        'images': album_data.get('images', []),
        'release_date': release_date,
        'album_type': album_data.get('album_type', 'album'),
        'total_tracks': album_data.get('total_tracks', 0)
    }

    added = 0
    for track in tracks:
        try:
            track_data = {
                'spotify_track_id': track['id'],
                'spotify_album_id': album_summary['id'],
                'spotify_artist_id': spotify_artist_id,
                'track_name': track['name'],
                'artist_name': artist_name,
                'album_name': album_name,
                'album_cover_url': cover_url,
                'duration_ms': track.get('duration_ms', 0),
                'popularity': album_data.get('popularity', 0),
                'release_date': release_date,
                'is_new_release': is_new,
                # Store enhanced track with full album data
                'track_data_json': {**track, 'album': dict(album_summary)},
                'artist_genres': artist_genres
            }

            if self.database.add_to_discovery_pool(track_data):
                added += 1

        except Exception as track_error:
            logger.debug(f"Error adding track to discovery pool: {track_error}")
            continue

    return added


def _pool_add_similar_artist_tracks(self, similar_artist, albums_per_artist):
    """Select releases of one similar artist and add their tracks to the pool; return tracks added."""
    import random

    artist_id = similar_artist.similar_artist_spotify_id
    artist_name = similar_artist.similar_artist_name

    # Get artist's albums from Spotify
    all_albums = self.spotify_client.get_artist_albums(
        artist_id,
        album_type='album,single,ep',  # Include albums, singles, and EPs for comprehensive discovery
        limit=50
    )
    if not all_albums:
        logger.debug(f"No albums found for {artist_name}")
        return 0

    # Fetch artist genres once for all tracks of this artist
    artist_genres = self._pool_fetch_artist_genres(artist_id, artist_name)

    # Per-type counts are only used for the log line below
    album_count = sum(1 for a in all_albums if getattr(a, 'album_type', None) == 'album')
    single_ep_count = sum(1 for a in all_albums if getattr(a, 'album_type', None) in ('single', 'ep'))

    # Always include the first releases returned (presumably the most recent ones,
    # ordering is decided by the Spotify client), capped so that the selection
    # never exceeds albums_per_artist.
    head_size = min(3, max(albums_per_artist, 0))
    selected_albums = list(all_albums[:head_size])

    # Fill the remaining slots with a random pick from the rest of the catalogue
    remaining_slots = albums_per_artist - len(selected_albums)
    if remaining_slots > 0:
        remaining_content = all_albums[head_size:]
        if len(remaining_content) > remaining_slots:
            selected_albums.extend(random.sample(remaining_content, remaining_slots))
        else:
            selected_albums.extend(remaining_content)

    logger.info(f" Selected {len(selected_albums)} releases from {len(all_albums)} available (albums: {album_count}, singles/EPs: {single_ep_count})")

    added = 0
    for album_idx, album in enumerate(selected_albums, 1):
        try:
            # Get full album data with tracks
            album_data = self.spotify_client.get_album(album.id)
            if not album_data or 'tracks' not in album_data:
                continue

            tracks = album_data['tracks'].get('items', [])
            logger.debug(f" Album {album_idx}: {album_data.get('name', 'Unknown')} ({len(tracks)} tracks)")

            added += self._pool_add_album_tracks(
                album_data,
                tracks,
                spotify_artist_id=artist_id,
                artist_name=artist_name,
                album_name=album_data.get('name', 'Unknown Album'),
                artist_genres=artist_genres
            )

            # Small delay between albums
            time.sleep(DELAY_BETWEEN_ALBUMS)

        except Exception as album_error:
            logger.warning(f"Error processing album: {album_error}")
            continue

    return added


def _pool_add_library_album_tracks(self):
    """Look up 5 random library albums on Spotify and add their tracks to the pool; return tracks added."""
    with self.database._get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute("""
            SELECT DISTINCT a.title, ar.name as artist_name
            FROM albums a
            JOIN artists ar ON a.artist_id = ar.id
            ORDER BY RANDOM()
            LIMIT 5
        """)
        db_albums = cursor.fetchall()

    # The rows are fully fetched, so the slow Spotify calls below run after
    # the connection context has been exited.
    logger.info(f"Processing {len(db_albums)} database albums for discovery pool")

    added = 0
    for db_idx, album_row in enumerate(db_albums, 1):
        try:
            # Search for album on Spotify
            query = f"album:{album_row['title']} artist:{album_row['artist_name']}"
            search_results = self.spotify_client.search_albums(query, limit=1)

            if search_results and len(search_results) > 0:
                album_data = self.spotify_client.get_album(search_results[0].id)

                if album_data and 'tracks' in album_data:
                    tracks = album_data['tracks'].get('items', [])

                    # The first credited album artist supplies the artist id and genres
                    album_artists = album_data.get('artists')
                    primary_artist_id = album_artists[0]['id'] if album_artists else ''
                    artist_genres = (
                        self._pool_fetch_artist_genres(primary_artist_id, "album artist")
                        if album_artists else []
                    )

                    added += self._pool_add_album_tracks(
                        album_data,
                        tracks,
                        spotify_artist_id=primary_artist_id,
                        artist_name=album_row['artist_name'],
                        album_name=album_row['title'],
                        artist_genres=artist_genres
                    )

                    time.sleep(DELAY_BETWEEN_ALBUMS)

        except Exception as album_error:
            logger.debug(f"Error processing database album {album_row['title']}: {album_error}")
            continue

        # Rate limit between albums
        if db_idx < len(db_albums):
            time.sleep(DELAY_BETWEEN_ARTISTS)

    return added
|
|
|
|
def update_discovery_pool_incremental(self):
    """
    Lightweight incremental update for discovery pool.

    Only watchlist artists are checked (similar artists are not touched):
    - Skips when the database reports the pool was updated less than 6 hours ago.
    - Requests the latest 5 releases per watchlist artist.
    - Keeps only releases accepted by ``is_album_after_timestamp`` with a
      cutoff of 7 days ago, and adds their tracks to the pool.
    - Records the pool size/timestamp only when at least one track was added.

    Returns:
        None. Errors are logged instead of being raised to the caller.
    """
    try:
        # Check if we should run (prevents over-polling Spotify)
        if not self.database.should_populate_discovery_pool(hours_threshold=6):
            logger.info("Discovery pool was updated recently (< 6 hours ago). Skipping incremental update.")
            return

        logger.info("Starting incremental discovery pool update (watchlist artists only)...")

        watchlist_artists = self.database.get_watchlist_artists()
        if not watchlist_artists:
            logger.info("No watchlist artists to check for incremental update")
            return

        cutoff_date = datetime.now() - timedelta(days=7)  # Only last week's releases
        total_tracks_added = 0

        for artist_idx, artist in enumerate(watchlist_artists, 1):
            # Small delay between artists. Sleeping at the top of the iteration keeps
            # the rate limit in force even when the previous artist had no releases
            # or raised an error.
            if artist_idx > 1:
                time.sleep(DELAY_BETWEEN_ARTISTS)

            try:
                logger.info(f"[{artist_idx}/{len(watchlist_artists)}] Checking {artist.artist_name} for new releases...")

                # Only fetch latest 5 releases (much faster than full scan)
                recent_releases = self.spotify_client.get_artist_albums(
                    artist.spotify_artist_id,
                    album_type='album,single,ep',
                    limit=5
                )
                if not recent_releases:
                    continue

                # Fetch artist genres once for all tracks of this artist
                artist_genres = []
                try:
                    artist_data = self.spotify_client.get_artist(artist.spotify_artist_id)
                    if artist_data and 'genres' in artist_data:
                        artist_genres = artist_data['genres']
                except Exception as e:
                    logger.debug(f"Could not fetch genres for {artist.artist_name}: {e}")

                for release in recent_releases:
                    try:
                        # Check if release is within cutoff
                        if not self.is_album_after_timestamp(release, cutoff_date):
                            continue  # Skip older releases

                        # Get full album data with tracks
                        album_data = self.spotify_client.get_album(release.id)
                        if not album_data or 'tracks' not in album_data:
                            continue

                        tracks = album_data['tracks'].get('items', [])
                        logger.debug(f" New release: {release.name} ({len(tracks)} tracks)")

                        # Determine if this is a new release (within last 30 days).
                        # Only full YYYY-MM-DD dates qualify; anything else stays False.
                        release_date_str = album_data.get('release_date', '')
                        is_new = False
                        try:
                            if release_date_str and len(release_date_str) == 10:
                                release_date = datetime.strptime(release_date_str, "%Y-%m-%d")
                                is_new = (datetime.now() - release_date).days <= 30
                        except (TypeError, ValueError) as date_error:
                            logger.debug(f"Could not parse release date {release_date_str!r}: {date_error}")

                        # Add each track to discovery pool
                        for track in tracks:
                            try:
                                # Enhance track object with full album data (including album_type)
                                enhanced_track = {
                                    **track,
                                    'album': {
                                        'id': album_data['id'],
                                        'name': album_data.get('name', 'Unknown Album'),
                                        'images': album_data.get('images', []),
                                        'release_date': release_date_str,
                                        'album_type': album_data.get('album_type', 'album'),
                                        'total_tracks': album_data.get('total_tracks', 0)
                                    }
                                }

                                track_data = {
                                    'spotify_track_id': track['id'],
                                    'spotify_album_id': album_data['id'],
                                    'spotify_artist_id': artist.spotify_artist_id,
                                    'track_name': track['name'],
                                    'artist_name': artist.artist_name,
                                    'album_name': album_data.get('name', 'Unknown Album'),
                                    'album_cover_url': album_data.get('images', [{}])[0].get('url') if album_data.get('images') else None,
                                    'duration_ms': track.get('duration_ms', 0),
                                    'popularity': album_data.get('popularity', 0),
                                    'release_date': release_date_str,
                                    'is_new_release': is_new,
                                    'track_data_json': enhanced_track,  # Store enhanced track with full album data
                                    'artist_genres': artist_genres
                                }

                                if self.database.add_to_discovery_pool(track_data):
                                    total_tracks_added += 1

                            except Exception as track_error:
                                logger.debug(f"Error adding track to discovery pool: {track_error}")
                                continue

                    except Exception as release_error:
                        logger.warning(f"Error processing release: {release_error}")
                        continue

            except Exception as artist_error:
                logger.warning(f"Error checking {artist.artist_name}: {artist_error}")
                continue

        logger.info(f"Incremental update complete: {total_tracks_added} new tracks added from watchlist artists")

        # Update timestamp.
        # NOTE(review): the timestamp is only refreshed when tracks were added, so a run
        # that finds nothing does not start a new 6-hour cooldown - confirm this is intended.
        if total_tracks_added > 0:
            # Get current track count
            with self.database._get_connection() as conn:
                cursor = conn.cursor()
                cursor.execute("SELECT COUNT(*) as count FROM discovery_pool")
                current_count = cursor.fetchone()['count']

            self.database.update_discovery_pool_timestamp(track_count=current_count)
            logger.info(f"Discovery pool now contains {current_count} total tracks")

    except Exception as e:
        logger.error(f"Error during incremental discovery pool update: {e}")
        import traceback
        traceback.print_exc()
|
|
|
|
def cache_discovery_recent_albums(self):
    """
    Cache recent albums from watchlist and similar artists for discover page.

    The existing cache is cleared first. Then the latest 20 releases (albums,
    singles, EPs) of every watchlist artist and of the top 50 similar artists
    are inspected, and each release whose full YYYY-MM-DD date falls within
    the last 30 days is written to the cache.

    Returns:
        None. Errors are logged instead of being raised to the caller.
    """
    try:
        logger.info("Caching recent albums for discover page...")

        # Clear existing cache
        self.database.clear_discovery_recent_albums()

        # 30-day window for better content variety while staying recent
        window_days = 30
        cutoff_date = datetime.now() - timedelta(days=window_days)
        cached_count = 0
        albums_checked = 0

        # Check ALL watchlist artists
        watchlist_artists = self.database.get_watchlist_artists()

        # Check top 50 similar artists
        similar_artists = self.database.get_top_similar_artists(limit=50)

        logger.info(f"Checking albums from {len(watchlist_artists)} watchlist + {len(similar_artists)} similar artists for recent releases (last {window_days} days)")

        # Process watchlist artists
        for artist in watchlist_artists:
            checked, cached = self._cache_recent_albums_for_artist(
                artist.spotify_artist_id, artist.artist_name, 'watchlist', cutoff_date
            )
            albums_checked += checked
            cached_count += cached

            # Rate limiting between artists (applied even when the fetch failed)
            time.sleep(DELAY_BETWEEN_ARTISTS)

        # Process similar artists
        for artist in similar_artists:
            checked, cached = self._cache_recent_albums_for_artist(
                artist.similar_artist_spotify_id, artist.similar_artist_name, 'similar', cutoff_date
            )
            albums_checked += checked
            cached_count += cached

            # Rate limiting between artists (applied even when the fetch failed)
            time.sleep(DELAY_BETWEEN_ARTISTS)

        logger.info(f"Cached {cached_count} recent albums from {albums_checked} albums checked (cutoff: {cutoff_date.strftime('%Y-%m-%d')})")

    except Exception as e:
        logger.error(f"Error caching discovery recent albums: {e}")
        import traceback
        traceback.print_exc()


def _cache_recent_albums_for_artist(self, artist_spotify_id, artist_name, artist_kind, cutoff_date):
    """
    Cache one artist's releases dated on or after ``cutoff_date``.

    Args:
        artist_spotify_id: Spotify id used to fetch the artist's releases.
        artist_name: Name stored with each cached album and used in log lines.
        artist_kind: Label for log messages ('watchlist' or 'similar').
        cutoff_date: Naive datetime; older releases are ignored.

    Returns:
        A ``(albums_checked, albums_cached)`` tuple. Never raises; failures are logged.
    """
    checked = 0
    cached = 0

    try:
        albums = self.spotify_client.get_artist_albums(
            artist_spotify_id,
            album_type='album,single,ep',  # Include EPs for comprehensive coverage
            limit=20
        )

        # A missing result is treated as "no releases" rather than an error
        for album in albums or []:
            try:
                checked += 1

                release_str = getattr(album, 'release_date', None)
                # Year-only or year-month dates cannot be compared reliably, skip them
                if not release_str or len(release_str) < 10:
                    continue

                release_date = datetime.strptime(release_str[:10], "%Y-%m-%d")
                if release_date < cutoff_date:
                    continue

                album_data = {
                    'album_spotify_id': album.id,
                    'album_name': album.name,
                    'artist_name': artist_name,
                    'artist_spotify_id': artist_spotify_id,
                    'album_cover_url': getattr(album, 'image_url', None),
                    'release_date': release_str,
                    'album_type': getattr(album, 'album_type', 'album')
                }

                if self.database.cache_discovery_recent_album(album_data):
                    cached += 1
                    logger.debug(f"Cached recent album: {album.name} by {artist_name} ({release_str})")

            except Exception as e:
                logger.warning(f"Error checking album for recent releases: {e}")
                continue

    except Exception as e:
        logger.debug(f"Error fetching albums for {artist_kind} artist {artist_name}: {e}")

    return checked, cached
|
|
|
|
def curate_discovery_playlists(self):
    """
    Curate consistent playlist selections that stay the same until the next curation run.

    Two playlists are saved through ``self.database.save_curated_playlist``:
    - 'release_radar': up to 50 track ids taken from the cached recent albums,
      ranked by a score combining recency, popularity and a bonus for singles.
    - 'discovery_weekly': up to 50 track ids sampled from the discovery pool,
      split across popular, mid-tier and deep-cut popularity tiers.

    Returns:
        None. Errors are logged instead of being raised to the caller.
    """
    try:
        logger.info("Curating Release Radar playlist...")
        release_radar_tracks = self._curate_release_radar_track_ids()
        self.database.save_curated_playlist('release_radar', release_radar_tracks)
        logger.info(f"Release Radar curated: {len(release_radar_tracks)} tracks")

        logger.info("Curating Discovery Weekly playlist...")
        discovery_weekly_tracks = self._curate_discovery_weekly_track_ids()
        self.database.save_curated_playlist('discovery_weekly', discovery_weekly_tracks)
        logger.info(f"Discovery Weekly curated: {len(discovery_weekly_tracks)} tracks")

        logger.info("Playlist curation complete")

    except Exception as e:
        logger.error(f"Error curating discovery playlists: {e}")
        import traceback
        traceback.print_exc()


def _curate_release_radar_track_ids(self):
    """Score tracks of the cached recent albums, store the winners in the pool, and return their ids."""
    import random

    # Up to 50 cached recent albums are considered
    recent_albums = self.database.get_discovery_recent_albums(limit=50)
    if not recent_albums:
        return []

    # Group albums by artist for variety
    albums_by_artist = {}
    for album in recent_albums:
        albums_by_artist.setdefault(album['artist_name'], []).append(album)

    balanced_track_data = []
    for artist_albums in albums_by_artist.values():
        scored_tracks = []

        for album in artist_albums:
            try:
                album_data = self.spotify_client.get_album(album['album_spotify_id'])
                if not album_data or 'tracks' not in album_data:
                    continue

                # Days since release for the recency score; 14 is the fallback
                # when the cached date is missing or malformed.
                days_old = 14
                try:
                    release_date_str = album.get('release_date', '')
                    if release_date_str and len(release_date_str) >= 10:
                        release_date = datetime.strptime(release_date_str[:10], "%Y-%m-%d")
                        days_old = (datetime.now() - release_date).days
                except (AttributeError, TypeError, ValueError):
                    pass  # Deliberate best-effort: keep the default age

                # Score = half the recency score + 30% of popularity + flat 20 bonus for singles
                recency_score = max(0, 100 - (days_old * 7))  # Newer = higher
                single_bonus = 20 if album.get('album_type', 'album') == 'single' else 0

                # Only album metadata is kept (not the full album with all tracks)
                album_summary = {
                    'id': album_data['id'],
                    'name': album_data.get('name', 'Unknown Album'),
                    'images': album_data.get('images', []),
                    'release_date': album_data.get('release_date', ''),
                    'album_type': album_data.get('album_type', 'album'),
                    'total_tracks': album_data.get('total_tracks', 0)
                }

                for track in album_data['tracks']['items']:
                    # Falls back to the album popularity when the track has none
                    popularity_score = track.get('popularity', album_data.get('popularity', 50))
                    total_score = (recency_score * 0.5) + (popularity_score * 0.3) + single_bonus

                    scored_tracks.append({
                        'id': track['id'],
                        'name': track['name'],
                        'artists': track.get('artists', []),
                        'album': dict(album_summary),
                        'duration_ms': track.get('duration_ms', 0),
                        'popularity': popularity_score,
                        'score': total_score,
                        'days_old': days_old
                    })

            except Exception as e:
                # Tracks collected before the failure are kept; the rest of the album is skipped
                logger.debug(f"Error collecting Release Radar tracks from album: {e}")
                continue

        # Balance by artist: keep each artist's 6 best-scoring tracks
        scored_tracks.sort(key=lambda t: t['score'], reverse=True)
        balanced_track_data.extend(scored_tracks[:6])

    # Rank everything, keep the top 75, shuffle (prevents album grouping), keep 50
    balanced_track_data.sort(key=lambda t: t['score'], reverse=True)
    top_tracks = balanced_track_data[:75]
    random.shuffle(top_tracks)
    release_radar_track_data = top_tracks[:50]

    # Add Release Radar tracks to discovery pool so they're available for fast lookup
    logger.info(f"Adding {len(release_radar_track_data)} Release Radar tracks to discovery pool...")

    # Cache genres by artist_id to avoid duplicate API calls
    artist_genres_cache = {}

    for track_data in release_radar_track_data:
        try:
            track_artists = track_data['artists']
            primary_artist = track_artists[0] if track_artists else None

            # Fetch artist genres (with caching)
            artist_genres = []
            if primary_artist:
                artist_id = primary_artist['id']
                if artist_id in artist_genres_cache:
                    artist_genres = artist_genres_cache[artist_id]
                else:
                    try:
                        artist_data = self.spotify_client.get_artist(artist_id)
                        if artist_data and 'genres' in artist_data:
                            artist_genres = artist_data['genres']
                        artist_genres_cache[artist_id] = artist_genres
                    except Exception as e:
                        logger.debug(f"Could not fetch genres for artist {artist_id}: {e}")

            # Format track data for discovery pool (expects specific structure)
            formatted_track = {
                'spotify_track_id': track_data['id'],
                'spotify_album_id': track_data['album'].get('id', ''),
                'spotify_artist_id': primary_artist['id'] if primary_artist else '',
                'track_name': track_data['name'],
                'artist_name': primary_artist['name'] if primary_artist else 'Unknown',
                'album_name': track_data['album'].get('name', 'Unknown'),
                'album_cover_url': track_data['album']['images'][0]['url'] if track_data['album'].get('images') else None,
                'duration_ms': track_data.get('duration_ms', 0),
                'popularity': track_data.get('popularity', 0),
                'release_date': track_data['album'].get('release_date', ''),
                'is_new_release': True,
                'track_data_json': track_data,
                'artist_genres': artist_genres
            }
            self.database.add_to_discovery_pool(formatted_track)

        except Exception as e:
            logger.warning(f"Failed to add track {track_data['name']} to discovery pool: {e}")
            continue

    return [track['id'] for track in release_radar_track_data]


def _curate_discovery_weekly_track_ids(self):
    """Sample up to 50 track ids from the discovery pool, balanced across popularity tiers."""
    import random

    discovery_tracks = self.database.get_discovery_pool_tracks(limit=2000, new_releases_only=False)
    if not discovery_tracks:
        return []

    # Separate tracks by popularity tiers for balanced selection
    popular_picks = []  # popularity >= 60
    balanced_mix = []   # 40 <= popularity < 60
    deep_cuts = []      # popularity < 40

    for track in discovery_tracks:
        popularity = getattr(track, 'popularity', 50)
        if popularity is None:
            # A missing stored popularity is treated like the neutral default
            popularity = 50

        if popularity >= 60:
            popular_picks.append(track)
        elif popularity >= 40:
            balanced_mix.append(track)
        else:
            deep_cuts.append(track)

    logger.info(f"Discovery pool breakdown: {len(popular_picks)} popular, {len(balanced_mix)} mid-tier, {len(deep_cuts)} deep cuts")

    # Randomly select from each tier:
    # 40% popular picks (20 tracks), 40% mid-tier (20 tracks), 20% deep cuts (10 tracks)
    random.shuffle(popular_picks)
    random.shuffle(balanced_mix)
    random.shuffle(deep_cuts)

    chosen_popular = popular_picks[:20]
    chosen_mid_tier = balanced_mix[:20]
    chosen_deep_cuts = deep_cuts[:10]

    selected_tracks = chosen_popular + chosen_mid_tier + chosen_deep_cuts

    # Shuffle final selection for variety
    random.shuffle(selected_tracks)

    # Extract track IDs
    discovery_weekly_tracks = [track.spotify_track_id for track in selected_tracks]

    logger.info(f"Discovery Weekly composition: {len(chosen_popular)} popular + {len(chosen_mid_tier)} mid-tier + {len(chosen_deep_cuts)} deep cuts = {len(discovery_weekly_tracks)} total")

    return discovery_weekly_tracks
|
|
|
|
def _populate_seasonal_content(self):
    """
    Refresh seasonal discovery content as part of the watchlist scan.

    The current season (as reported by the seasonal discovery service) is
    repopulated and re-curated when the service says it is due using a 7-day
    threshold. Every other season listed in ``SEASONAL_CONFIG`` gets the same
    treatment with a 14-day threshold. Any error is logged, never raised.
    """
    try:
        from core.seasonal_discovery import get_seasonal_discovery_service

        logger.info("Checking seasonal content update...")

        service = get_seasonal_discovery_service(self.spotify_client, self.database)

        # The active season is handled first so it gets priority
        active_season = service.get_current_season()

        if active_season:
            is_due = service.should_populate_seasonal_content(active_season, days_threshold=7)
            if not is_due:
                logger.info(f"Current season '{active_season}' is up to date")
            else:
                logger.info(f"Populating current season: {active_season}")
                service.populate_seasonal_content(active_season)
                service.curate_seasonal_playlist(active_season)

        # Remaining seasons are refreshed less often (14 day threshold)
        from core.seasonal_discovery import SEASONAL_CONFIG

        for season in SEASONAL_CONFIG.keys():
            # The active season was already dealt with above
            if season == active_season:
                continue

            if not service.should_populate_seasonal_content(season, days_threshold=14):
                continue

            logger.info(f"Populating season: {season}")
            service.populate_seasonal_content(season)
            service.curate_seasonal_playlist(season)

        logger.info("Seasonal content update complete")

    except Exception as e:
        logger.error(f"Error populating seasonal content: {e}")
        import traceback
        traceback.print_exc()
|
|
|
|
# Process-wide scanner, created lazily by get_watchlist_scanner()
_watchlist_scanner_instance = None


def get_watchlist_scanner(spotify_client: SpotifyClient) -> WatchlistScanner:
    """
    Return the shared watchlist scanner, building it on the first call.

    ``spotify_client`` is only used when the scanner has to be created; later
    calls return the already-built instance regardless of the argument.
    """
    global _watchlist_scanner_instance

    scanner = _watchlist_scanner_instance
    if scanner is None:
        scanner = WatchlistScanner(spotify_client)
        _watchlist_scanner_instance = scanner

    return scanner