You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
SoulSync/core/genius_client.py

386 lines
13 KiB

import requests
import re
import time
import threading
from typing import Dict, Optional, Any, List
from functools import wraps
from utils.logging_config import get_logger
logger = get_logger("genius_client")

# ── Rate-limit configuration and shared state (used by `rate_limited`) ──

# Minimum spacing between two Genius calls, in seconds.
# 1.5s between calls — Genius 429s at 500ms
MIN_API_INTERVAL = 1.5

# Held while a caller waits out the backoff window / minimum interval.
_api_call_lock = threading.Lock()

# time.time() of the most recent call that passed the throttle (0 = none yet).
_last_api_call_time = 0

# Timestamp until which all calls should wait (0 = no backoff window pending).
_rate_limit_until = 0

# Extra backoff seconds after a 429; raised on 429, lowered again on success.
_rate_limit_backoff = 0
# Per-thread marker: True while the current thread is already executing inside
# a rate-limited call. Lets nested decorated calls skip the throttle.
_rate_limit_reentry = threading.local()


def rate_limited(func):
    """Decorator to enforce rate limiting on Genius API calls with exponential backoff on 429.

    Before invoking ``func`` the wrapper (while holding ``_api_call_lock``):
      1. waits out any pending backoff window (``_rate_limit_until``), then
      2. waits until at least ``MIN_API_INTERVAL`` seconds have passed since
         the previous throttled call.

    If ``func`` raises an exception whose message contains "429" or
    "rate limit", the backoff is doubled (minimum 30s, capped at 120s), the
    calling thread sleeps for that long, and the exception is re-raised.
    Each successful call lowers the backoff by 5 seconds.

    A decorated function that calls another decorated function on the same
    thread (e.g. ``search_song`` -> ``search``) is throttled only once: the
    nested call passes straight through, so a single 429 is also handled by
    the outermost wrapper only instead of triggering two stacked backoffs.

    Args:
        func: The callable to wrap.

    Returns:
        The wrapped callable, with ``func``'s metadata preserved.

    Raises:
        Whatever ``func`` raises; exceptions are never swallowed.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        global _last_api_call_time, _rate_limit_backoff, _rate_limit_until

        # Nested call on this thread: the outermost wrapper has already
        # throttled and will deal with any 429 raised below it.
        if getattr(_rate_limit_reentry, "active", False):
            return func(*args, **kwargs)

        with _api_call_lock:
            current_time = time.time()
            # If in backoff period from a previous 429, wait it out
            if current_time < _rate_limit_until:
                wait = _rate_limit_until - current_time
                logger.debug(f"Genius rate limit backoff: waiting {wait:.1f}s")
                time.sleep(wait)
            time_since_last_call = time.time() - _last_api_call_time
            if time_since_last_call < MIN_API_INTERVAL:
                time.sleep(MIN_API_INTERVAL - time_since_last_call)
            _last_api_call_time = time.time()

        _rate_limit_reentry.active = True
        try:
            result = func(*args, **kwargs)
        except Exception as e:
            message = str(e)
            if "429" in message or "rate limit" in message.lower():
                # Exponential backoff: 30s → 60s → 120s (cap at 120s)
                _rate_limit_backoff = min(120, max(30, _rate_limit_backoff * 2))
                _rate_limit_until = time.time() + _rate_limit_backoff
                logger.warning(f"Genius 429 rate limit — backing off {_rate_limit_backoff}s")
                time.sleep(_rate_limit_backoff)
            # Bare raise keeps the original traceback intact.
            raise
        finally:
            _rate_limit_reentry.active = False

        # Success — gradually reduce backoff.
        # The lock is not taken here: it may be held by another thread that is
        # sleeping inside the throttle above.
        if _rate_limit_backoff > 0:
            _rate_limit_backoff = max(0, _rate_limit_backoff - 5)
        return result

    return wrapper
class GeniusClient:
    """Client for interacting with the Genius API (metadata + lyrics scraping).

    API requests go to ``BASE_URL`` with a bearer token on ``self.session``;
    lyrics are scraped from song pages through ``self.scrape_session``, a
    separate session that carries a browser User-Agent and no auth header.
    """

    BASE_URL = "https://api.genius.com"

    # Genius wraps lyrics in <div data-lyrics-container="true">.
    # NOTE(review): the non-greedy group ends at the first </div>, so a <div>
    # nested inside a container would truncate that container's text —
    # confirm against the current page markup.
    _LYRICS_CONTAINER_RE = re.compile(
        r'<div[^>]*data-lyrics-container="true"[^>]*>(.*?)</div>', re.DOTALL
    )
    _BR_TAG_RE = re.compile(r'<br\s*/?>')
    _ANY_TAG_RE = re.compile(r'<[^>]+>')

    def __init__(self, access_token: str = ""):
        """Create the API session and the scraping session.

        Args:
            access_token: Genius API bearer token. When empty, API requests
                are skipped and return None.
        """
        self.access_token = access_token
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'SoulSync/1.0',
            'Accept': 'application/json'
        })
        # Separate session for web scraping (no auth header)
        self.scrape_session = requests.Session()
        self.scrape_session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
        })
        logger.info("Genius client initialized")

    def _make_request(self, endpoint: str, params: Optional[Dict] = None, timeout: int = 10) -> Optional[Dict]:
        """Make an authenticated request to the Genius API.

        Args:
            endpoint: Path appended to ``BASE_URL`` (e.g. '/search').
            params: Query-string parameters; None is sent as no parameters.
            timeout: Request timeout in seconds.

        Returns:
            The 'response' member of the JSON body, or None when no token is
            configured, on HTTP 401/404, on timeout, on any other request
            failure, or when the body's meta.status is not 200.

        Raises:
            requests.exceptions.HTTPError: Only for HTTP 429, so that a
                rate-limiting caller can back off.
        """
        if not self.access_token:
            logger.warning("Genius access token not configured")
            return None

        try:
            response = self.session.get(
                f"{self.BASE_URL}{endpoint}",
                params=params or {},
                headers={'Authorization': f'Bearer {self.access_token}'},
                timeout=timeout
            )
            if response.status_code == 401:
                logger.error("Genius API: Invalid access token")
                return None
            if response.status_code == 404:
                return None
            response.raise_for_status()

            data = response.json()
            meta = data.get('meta', {})
            if meta.get('status') != 200:
                logger.error(f"Genius API error: {meta}")
                return None
            return data.get('response')
        except requests.exceptions.Timeout:
            logger.warning(f"Genius API timeout for endpoint: {endpoint}")
            return None
        except requests.exceptions.HTTPError as e:
            if e.response is not None and e.response.status_code == 429:
                # Re-raise 429s so the rate_limited decorator can handle backoff
                raise
            logger.error(f"Genius API request error ({endpoint}): {e}")
            return None
        except Exception as e:
            logger.error(f"Genius API request error ({endpoint}): {e}")
            return None

    @staticmethod
    def _loosely_matches(wanted: str, candidate: str) -> bool:
        """Return True when either lowercase string contains the other.

        An empty ``candidate`` never matches: the empty string is a substring
        of every string, so a search hit with a missing field would otherwise
        be accepted for any query.
        """
        if not candidate:
            return False
        return wanted in candidate or candidate in wanted

    # ── Search Methods ──

    @rate_limited
    def search(self, query: str, per_page: int = 5) -> List[Dict[str, Any]]:
        """
        Search Genius for songs matching a query.

        Args:
            query: Free-text search string.
            per_page: Maximum number of hits requested.

        Returns:
            List of hit dicts (only those with type 'song'), each containing
            a 'result' with:
            id, title, artist_names, url, song_art_image_url, lyrics_state.
            Empty list when the request fails or returns nothing.
        """
        data = self._make_request('/search', {
            'q': query,
            'per_page': per_page
        })
        if not data:
            return []
        # 'or []' also covers an explicit null in the payload.
        hits = data.get('hits') or []
        return [h for h in hits if h.get('type') == 'song']

    # Not decorated with @rate_limited: the only network call goes through
    # search(), which is already throttled. Decorating both made every lookup
    # wait twice and handle a single 429 twice.
    def search_song(self, artist_name: str, track_title: str) -> Optional[Dict[str, Any]]:
        """
        Search for a specific song by artist and title.
        Returns the best matching song result.

        A hit is accepted when its artist and its title each contain, or are
        contained in, the requested value (case-insensitive). Hits with an
        empty artist or title are skipped.

        Returns:
            Song dict with: id, title, artist_names, url, song_art_image_url,
            primary_artist (id, name, url, image_url), album (id, name, url),
            or None when no hit matches.
        """
        query = f"{artist_name} {track_title}"
        hits = self.search(query, per_page=5)
        if not hits:
            logger.debug(f"No results for: {query}")
            return None

        # Try to find best match by checking artist name
        artist_lower = artist_name.lower().strip()
        title_lower = track_title.lower().strip()
        for hit in hits:
            result = hit.get('result') or {}
            result_artist = (result.get('artist_names') or '').lower()
            result_title = (result.get('title') or '').lower()
            # Check if artist and title match reasonably
            if (self._loosely_matches(artist_lower, result_artist)
                    and self._loosely_matches(title_lower, result_title)):
                logger.debug(f"Found song match: {result.get('title')} by {result.get('artist_names')}")
                return result

        # No confident match — let the worker mark as not_found and retry later
        logger.debug(f"No song match found in search results for: {artist_name} - {track_title}")
        return None

    # ── Song Methods ──

    @rate_limited
    def get_song(self, song_id: int) -> Optional[Dict[str, Any]]:
        """
        Get detailed song info by Genius song ID.

        Returns:
            Song dict with: id, title, artist_names, url, song_art_image_url,
            description (dom object), album, media, custom_performances,
            producer_artists, writer_artists, featured_artists;
            None when the request fails or the payload has no 'song'.
        """
        data = self._make_request(f'/songs/{song_id}', {
            'text_format': 'plain'
        })
        if not data:
            return None
        song = data.get('song')
        if song:
            logger.debug(f"Got song info for ID: {song_id}")
            return song
        return None

    # ── Artist Methods ──

    # Not decorated with @rate_limited: the only network call goes through
    # search(), which is already throttled.
    def search_artist(self, artist_name: str) -> Optional[Dict[str, Any]]:
        """
        Search for an artist by name (via song search, extract primary_artist).

        A hit is accepted when its primary artist name contains, or is
        contained in, the requested name (case-insensitive). Hits without a
        primary artist name are skipped.

        Returns:
            Artist dict with: id, name, url, image_url, or None when no hit
            matches.
        """
        hits = self.search(artist_name, per_page=5)
        if not hits:
            return None

        artist_lower = artist_name.lower().strip()
        for hit in hits:
            result = hit.get('result') or {}
            primary = result.get('primary_artist') or {}
            primary_name = (primary.get('name') or '').lower()
            if self._loosely_matches(artist_lower, primary_name):
                logger.debug(f"Found artist: {primary.get('name')}")
                return primary

        # No confident match — let the worker mark as not_found and retry later
        logger.debug(f"No artist match found in search results for: {artist_name}")
        return None

    @rate_limited
    def get_artist(self, artist_id: int) -> Optional[Dict[str, Any]]:
        """
        Get detailed artist info by Genius artist ID.

        Returns:
            Artist dict with: id, name, url, image_url, description,
            alternate_names, facebook_name, twitter_name;
            None when the request fails or the payload has no 'artist'.
        """
        data = self._make_request(f'/artists/{artist_id}', {
            'text_format': 'plain'
        })
        if not data:
            return None
        artist = data.get('artist')
        if artist:
            logger.debug(f"Got artist info for ID: {artist_id}")
            return artist
        return None

    @rate_limited
    def get_artist_songs(self, artist_id: int, sort: str = 'popularity', per_page: int = 20) -> List[Dict[str, Any]]:
        """
        Get songs by an artist.

        Args:
            artist_id: Genius artist ID
            sort: Sort order ('popularity', 'title')
            per_page: Results per page

        Returns:
            List of song dicts (empty when the request fails).
        """
        data = self._make_request(f'/artists/{artist_id}/songs', {
            'sort': sort,
            'per_page': per_page
        })
        if not data:
            return []
        # 'or []' also covers an explicit null in the payload.
        return data.get('songs') or []

    # ── Lyrics Scraping ──

    @rate_limited
    def get_lyrics(self, song_url: str) -> Optional[str]:
        """
        Scrape lyrics from a Genius song page.
        The Genius API doesn't provide lyrics directly — they must be scraped from the web page.

        Args:
            song_url: Full Genius URL (e.g. https://genius.com/Artist-song-lyrics)

        Returns:
            Lyrics text, or None when the URL is empty, the page does not
            return HTTP 200, no lyrics container is found, the extracted text
            is empty, or any error occurs while fetching/parsing.
        """
        if not song_url:
            return None

        # Function-scope import under an alias so the name cannot clash with
        # a local variable holding page HTML.
        from html import unescape as unescape_entities

        try:
            response = self.scrape_session.get(song_url, timeout=15)
            if response.status_code != 200:
                logger.warning(f"Failed to fetch lyrics page: {response.status_code}")
                return None

            containers = self._LYRICS_CONTAINER_RE.findall(response.text)
            if not containers:
                logger.debug(f"No lyrics containers found on page: {song_url}")
                return None

            lyrics_parts = []
            for fragment in containers:
                # Line breaks become newlines, every other tag is dropped.
                text = self._BR_TAG_RE.sub('\n', fragment)
                text = self._ANY_TAG_RE.sub('', text)
                # Decode entities once, after tag removal, so an escaped
                # '&lt;' stays literal text and '&amp;lt;' is not decoded twice.
                lyrics_parts.append(unescape_entities(text).strip())

            lyrics = '\n'.join(lyrics_parts).strip()
            if lyrics:
                logger.debug(f"Scraped {len(lyrics)} chars of lyrics from: {song_url}")
                return lyrics
            return None
        except Exception as e:
            logger.error(f"Error scraping lyrics from {song_url}: {e}")
            return None

    def search_and_get_lyrics(self, artist_name: str, track_title: str) -> Optional[str]:
        """
        Convenience method: search for a song and scrape its lyrics.

        Returns:
            Lyrics text, or None when no song matches, the match has no URL,
            its lyrics_state is 'unreleased', or scraping yields nothing.
        """
        song = self.search_song(artist_name, track_title)
        if not song:
            return None
        url = song.get('url')
        if not url:
            return None

        # Check if lyrics are available
        lyrics_state = song.get('lyrics_state', '')
        if lyrics_state == 'unreleased':
            logger.debug(f"Lyrics unreleased for: {artist_name} - {track_title}")
            return None
        return self.get_lyrics(url)

    # ── Utility Methods ──

    def extract_description(self, description_data) -> Optional[str]:
        """
        Extract plain text description from Genius description object.
        When text_format=plain, description comes as {plain: "text"}.

        Args:
            description_data: Either a dict with a 'plain' key or a string.

        Returns:
            The stripped description, or None when it is missing, blank, the
            placeholder '?', or not a string.
        """
        if not description_data:
            return None
        if isinstance(description_data, dict):
            plain = description_data.get('plain', '')
            # Guard against a non-string payload before calling .strip().
            if isinstance(plain, str):
                plain = plain.strip()
                if plain and plain != '?':
                    return plain
            return None
        if isinstance(description_data, str) and description_data.strip():
            return description_data.strip()
        return None

    def validate_token(self) -> bool:
        """Test if the access token is valid by making a simple request.

        Returns:
            True when a one-result '/search' request succeeds. False when no
            token is configured or the request returns nothing (HTTP 401/404,
            timeout, or any other request failure).

        Raises:
            requests.exceptions.HTTPError: If the API answers HTTP 429; the
                request helper deliberately lets that status propagate.
        """
        if not self.access_token:
            return False
        data = self._make_request('/search', {'q': 'test', 'per_page': 1})
        return data is not None