You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
SoulSync/core/metadata_cache.py

727 lines
32 KiB

"""
Universal Metadata Cache — caches all Spotify and iTunes API responses.
Stores full JSON responses alongside structured queryable fields for browsing.
Transparent to callers: check cache before API call, store after success.
"""
import json
import logging
import threading
from datetime import datetime
from typing import Optional, Dict, List, Tuple
# Module-level logger; the cache methods in this file report failures through it.
logger = logging.getLogger(__name__)
# Singleton state: the shared MetadataCache instance (None until
# get_metadata_cache() first creates it) and the lock guarding that creation.
_cache_instance = None
_cache_lock = threading.Lock()
def get_metadata_cache():
    """Return the shared MetadataCache, building it the first time it is requested."""
    global _cache_instance
    # Fast path: once the instance exists, hand it back without taking the lock.
    if _cache_instance is not None:
        return _cache_instance
    with _cache_lock:
        # Re-check while holding the lock: another thread may have created the
        # instance between the unlocked test above and acquiring the lock.
        if _cache_instance is None:
            _cache_instance = MetadataCache()
        return _cache_instance
class MetadataCache:
    """Cache of Spotify and iTunes API responses: structured fields plus raw JSON.

    Every public method is best-effort: any exception raised while talking to
    the database is logged and converted into a miss / no-op / empty result,
    so callers never have to guard cache calls themselves.
    """

    # TTL (days) used when an entity row's ttl_days column is NULL or 0.
    _DEFAULT_ENTITY_TTL_DAYS = 30
    # TTL (days) applied to cached search-result mappings on lookup.
    _SEARCH_TTL_DAYS = 7
    # IN (...) lookups are chunked to stay under SQLite's bound-variable limit.
    _BATCH_CHUNK_SIZE = 500
    # Fraction of a cached search's entity ids that must still resolve for the
    # cached result list to be served.
    _MIN_SEARCH_RESOLVE_RATIO = 0.8

    # Structured columns bound (in this order) between `name` and `raw_json`
    # in _UPSERT_ENTITY_SQL; a field missing from the extracted dict becomes NULL.
    _ENTITY_FIELD_COLUMNS = (
        'image_url', 'external_urls',
        'genres', 'popularity', 'followers',
        'artist_name', 'artist_id', 'release_date', 'total_tracks', 'album_type', 'label',
        'album_name', 'album_id', 'duration_ms', 'track_number', 'disc_number',
        'explicit', 'isrc', 'preview_url',
    )

    # The COALESCE sub-select carries the previous access_count across the
    # REPLACE, so re-storing an entity increments its counter instead of resetting it.
    _UPSERT_ENTITY_SQL = """
        INSERT OR REPLACE INTO metadata_cache_entities
            (source, entity_type, entity_id, name, image_url, external_urls,
             genres, popularity, followers,
             artist_name, artist_id, release_date, total_tracks, album_type, label,
             album_name, album_id, duration_ms, track_number, disc_number, explicit, isrc, preview_url,
             raw_json, updated_at, last_accessed_at, access_count)
        VALUES (?, ?, ?, ?, ?, ?,
                ?, ?, ?,
                ?, ?, ?, ?, ?, ?,
                ?, ?, ?, ?, ?, ?, ?, ?,
                ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP,
                COALESCE((SELECT access_count FROM metadata_cache_entities
                          WHERE source = ? AND entity_type = ? AND entity_id = ?), 0) + 1)
    """

    _TOUCH_ENTITY_SQL = """
        UPDATE metadata_cache_entities
        SET last_accessed_at = CURRENT_TIMESTAMP, access_count = access_count + 1
        WHERE id = ?
    """

    # Columns browse() may sort by; anything else falls back to last_accessed_at.
    _BROWSE_SORT_COLUMNS = frozenset({
        'last_accessed_at', 'created_at', 'access_count', 'name', 'popularity', 'updated_at',
    })

    # entity_type value -> key used in the get_stats() result.
    _STATS_TYPE_KEYS = {'artist': 'artists', 'album': 'albums', 'track': 'tracks'}

    def __init__(self):
        # Tables are created by MusicDatabase migration — we just use get_database()
        pass

    def _get_db(self):
        """Return the shared database object (imported at call time, not module import time)."""
        from database.music_database import get_database
        return get_database()

    # ─── Internal helpers ─────────────────────────────────────────────

    @staticmethod
    def _age_in_days(timestamp) -> Optional[int]:
        """Return whole days elapsed since a stored timestamp, or None if unparsable.

        Rows are stamped with SQLite's CURRENT_TIMESTAMP, which is UTC, so a
        naive timestamp is interpreted as UTC and compared against the current
        UTC time. Comparing against local ``datetime.now()`` would skew every
        TTL check by the machine's UTC offset.
        """
        from datetime import timezone  # module scope only imports `datetime`

        try:
            stamp = datetime.fromisoformat(timestamp)
        except (ValueError, TypeError):
            return None
        if stamp.tzinfo is None:
            stamp = stamp.replace(tzinfo=timezone.utc)
        return (datetime.now(timezone.utc) - stamp).days

    def _build_upsert_params(self, source: str, entity_type: str,
                             entity_id: str, raw_data: dict) -> tuple:
        """Build the parameter tuple for _UPSERT_ENTITY_SQL from a raw API response."""
        fields = self._extract_fields(source, entity_type, raw_data)
        # default=str stringifies any value json cannot encode natively instead
        # of failing the store.
        raw_json = json.dumps(raw_data, default=str)
        return (
            source, entity_type, entity_id,
            fields.get('name', ''),
            *(fields.get(column) for column in self._ENTITY_FIELD_COLUMNS),
            raw_json,
            source, entity_type, entity_id,
        )

    @staticmethod
    def _decode_json_columns(item: dict, columns) -> dict:
        """Replace JSON-text values of *columns* in *item* with parsed objects, in place."""
        for column in columns:
            if item.get(column):
                try:
                    item[column] = json.loads(item[column])
                except (json.JSONDecodeError, TypeError):
                    pass  # keep the stored value as-is when it is not valid JSON
        return item

    @staticmethod
    def _delete_matching(cursor, table: str, filters) -> int:
        """Delete rows of *table* matching every truthy (column, value) filter.

        With no truthy filter the whole table is emptied. *table* and the
        column names are internal constants, never caller input. Returns the
        number of deleted rows.
        """
        active = [(column, value) for column, value in filters if value]
        sql = f"DELETE FROM {table}"
        if active:
            sql += " WHERE " + ' AND '.join(f'{column} = ?' for column, _ in active)
        cursor.execute(sql, [value for _, value in active])
        return cursor.rowcount

    @staticmethod
    def _empty_stats() -> dict:
        """Return a zeroed statistics structure in the shape get_stats() reports."""
        return {
            'artists': {'spotify': 0, 'itunes': 0},
            'albums': {'spotify': 0, 'itunes': 0},
            'tracks': {'spotify': 0, 'itunes': 0},
            'searches': 0,
            'total_entries': 0,
            'total_hits': 0,
            'oldest': None,
            'newest': None,
        }

    # ─── Entity Methods ───────────────────────────────────────────────

    def get_entity(self, source: str, entity_type: str, entity_id: str) -> Optional[dict]:
        """Look up a cached entity.

        Returns the parsed raw_json dict on a hit and None on a miss. A row
        older than its TTL is deleted and reported as a miss; a hit updates
        the row's access statistics. Errors are logged and reported as a miss.
        """
        try:
            conn = self._get_db()._get_connection()
            try:
                cursor = conn.cursor()
                cursor.execute("""
                    SELECT id, raw_json, updated_at, ttl_days FROM metadata_cache_entities
                    WHERE source = ? AND entity_type = ? AND entity_id = ?
                """, (source, entity_type, entity_id))
                row = cursor.fetchone()
                if not row:
                    return None

                # Inline TTL check — don't serve stale data. An unparsable
                # timestamp (age None) is treated as "not expired".
                age_days = self._age_in_days(row['updated_at'])
                ttl_days = row['ttl_days'] or self._DEFAULT_ENTITY_TTL_DAYS
                if age_days is not None and age_days > ttl_days:
                    cursor.execute("DELETE FROM metadata_cache_entities WHERE id = ?", (row['id'],))
                    conn.commit()
                    return None

                # Touch: update access stats
                cursor.execute(self._TOUCH_ENTITY_SQL, (row['id'],))
                conn.commit()
                return json.loads(row['raw_json'])
            finally:
                conn.close()
        except Exception as e:
            logger.debug("Cache lookup error (%s/%s/%s): %s", source, entity_type, entity_id, e)
            return None

    def store_entity(self, source: str, entity_type: str, entity_id: str, raw_data: dict) -> None:
        """Store one entity, extracting structured fields from raw_data.

        Does nothing when entity_id or raw_data is empty. Errors are logged,
        never raised.
        """
        if not entity_id or not raw_data:
            return
        try:
            params = self._build_upsert_params(source, entity_type, entity_id, raw_data)
            conn = self._get_db()._get_connection()
            try:
                conn.cursor().execute(self._UPSERT_ENTITY_SQL, params)
                conn.commit()
            finally:
                conn.close()
        except Exception as e:
            logger.debug("Cache store error (%s/%s/%s): %s", source, entity_type, entity_id, e)

    def store_entities_bulk(self, source: str, entity_type: str, items: List[Tuple[str, dict]],
                            skip_if_exists: bool = False) -> None:
        """Store multiple entities in one transaction. items = [(entity_id, raw_data), ...]

        Items with an empty id or empty data are skipped, and so is any item
        whose fields cannot be extracted or serialized, so one malformed
        response does not discard the rest of the batch.

        Args:
            skip_if_exists: If True, don't overwrite existing entries. Use this for
                opportunistic caching of simplified data (e.g. from list endpoints)
                to avoid replacing richer data from detail endpoints.
        """
        if not items:
            return
        try:
            conn = self._get_db()._get_connection()
            try:
                cursor = conn.cursor()
                for entity_id, raw_data in items:
                    if not entity_id or not raw_data:
                        continue
                    if skip_if_exists:
                        cursor.execute("""
                            SELECT 1 FROM metadata_cache_entities
                            WHERE source = ? AND entity_type = ? AND entity_id = ?
                        """, (source, entity_type, entity_id))
                        if cursor.fetchone():
                            continue
                    try:
                        params = self._build_upsert_params(source, entity_type, entity_id, raw_data)
                    except (AttributeError, IndexError, KeyError, TypeError, ValueError) as e:
                        logger.debug("Cache bulk store skipped malformed item (%s/%s/%s): %s",
                                     source, entity_type, entity_id, e)
                        continue
                    cursor.execute(self._UPSERT_ENTITY_SQL, params)
                conn.commit()
            finally:
                conn.close()
        except Exception as e:
            logger.debug("Cache bulk store error (%s/%s): %s", source, entity_type, e)

    def get_entities_batch(self, source: str, entity_type: str,
                           entity_ids: List[str]) -> Tuple[Dict[str, dict], List[str]]:
        """Batch cache lookup. Returns (found_dict, missing_ids).

        found_dict maps entity_id to its parsed raw_json; missing_ids lists, in
        input order, every requested id not in found_dict. The two never
        overlap, even when a database error interrupts the lookup part-way,
        and missing_ids is always a new list. Rows whose raw_json cannot be
        parsed count as missing.
        """
        found: Dict[str, dict] = {}
        if not entity_ids:
            return found, []
        try:
            conn = self._get_db()._get_connection()
            try:
                cursor = conn.cursor()
                # Batch query in chunks to avoid SQLite variable limit
                for start in range(0, len(entity_ids), self._BATCH_CHUNK_SIZE):
                    chunk = list(entity_ids[start:start + self._BATCH_CHUNK_SIZE])
                    placeholders = ','.join('?' * len(chunk))
                    cursor.execute(f"""
                        SELECT entity_id, raw_json FROM metadata_cache_entities
                        WHERE source = ? AND entity_type = ? AND entity_id IN ({placeholders})
                    """, [source, entity_type, *chunk])
                    hit_ids = []
                    for row in cursor.fetchall():
                        try:
                            found[row['entity_id']] = json.loads(row['raw_json'])
                        except (TypeError, ValueError):
                            continue  # corrupt payload: report the id as missing
                        hit_ids.append(row['entity_id'])
                    # Touch the entries served from this chunk
                    if hit_ids:
                        hit_placeholders = ','.join('?' * len(hit_ids))
                        cursor.execute(f"""
                            UPDATE metadata_cache_entities
                            SET last_accessed_at = CURRENT_TIMESTAMP, access_count = access_count + 1
                            WHERE source = ? AND entity_type = ? AND entity_id IN ({hit_placeholders})
                        """, [source, entity_type, *hit_ids])
                conn.commit()
            finally:
                conn.close()
        except Exception as e:
            logger.debug("Cache batch lookup error: %s", e)
        missing = [eid for eid in entity_ids if eid not in found]
        return found, missing

    # ─── Search Cache Methods ─────────────────────────────────────────

    def get_search_results(self, source: str, search_type: str,
                           query: str, limit: int) -> Optional[List[dict]]:
        """Look up cached search results.

        The query is matched case-insensitively after trimming whitespace.
        Returns the list of raw_json dicts for the cached result ids, or None
        on a miss: empty/None query, no entry, an entry older than the search
        TTL (which is deleted), or too few of its entities still cached.
        """
        normalized = (query or '').strip().lower()
        if not normalized:
            return None
        try:
            conn = self._get_db()._get_connection()
            try:
                cursor = conn.cursor()
                cursor.execute("""
                    SELECT id, result_ids, created_at FROM metadata_cache_searches
                    WHERE source = ? AND search_type = ? AND query_normalized = ? AND search_limit = ?
                """, (source, search_type, normalized, limit))
                row = cursor.fetchone()
                if not row:
                    return None

                # Check TTL; an unparsable timestamp is treated as "not expired"
                age_days = self._age_in_days(row['created_at'])
                if age_days is not None and age_days > self._SEARCH_TTL_DAYS:
                    # Expired — delete and return miss
                    cursor.execute("DELETE FROM metadata_cache_searches WHERE id = ?", (row['id'],))
                    conn.commit()
                    return None

                # Touch search entry
                cursor.execute("""
                    UPDATE metadata_cache_searches
                    SET last_accessed_at = CURRENT_TIMESTAMP, access_count = access_count + 1
                    WHERE id = ?
                """, (row['id'],))
                conn.commit()

                # Resolve entity IDs to full data, preserving the stored order
                result_ids = json.loads(row['result_ids'])
                if not result_ids:
                    return []
                results = []
                for eid in result_ids:
                    cursor.execute("""
                        SELECT raw_json FROM metadata_cache_entities
                        WHERE source = ? AND entity_type = ? AND entity_id = ?
                    """, (source, search_type, eid))
                    entity_row = cursor.fetchone()
                    if entity_row:
                        results.append(json.loads(entity_row['raw_json']))

                # Only return if we found all (or most) entries — partial results are unreliable
                if len(results) >= len(result_ids) * self._MIN_SEARCH_RESOLVE_RATIO:
                    return results
                return None
            finally:
                conn.close()
        except Exception as e:
            logger.debug("Search cache lookup error (%s/%s/%s): %s", source, search_type, query, e)
            return None

    def store_search_results(self, source: str, search_type: str, query: str,
                             limit: int, entity_ids: List[str]) -> None:
        """Store a search -> entity-id mapping. Individual entities should already be stored.

        Does nothing for an empty/None query or an empty id list. Errors are
        logged, never raised.
        """
        original = (query or '').strip()
        normalized = original.lower()
        if not normalized or not entity_ids:
            return
        try:
            conn = self._get_db()._get_connection()
            try:
                conn.cursor().execute("""
                    INSERT OR REPLACE INTO metadata_cache_searches
                        (source, search_type, query_normalized, query_original, result_ids,
                         result_count, search_limit, last_accessed_at)
                    VALUES (?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
                """, (
                    source, search_type, normalized, original,
                    json.dumps(entity_ids), len(entity_ids), limit,
                ))
                conn.commit()
            finally:
                conn.close()
        except Exception as e:
            logger.debug("Search cache store error (%s/%s/%s): %s", source, search_type, query, e)

    # ─── Browsing (for UI) ────────────────────────────────────────────

    def browse(self, entity_type: str, source: Optional[str] = None, search: Optional[str] = None,
               sort: str = 'last_accessed_at', sort_dir: str = 'desc',
               offset: int = 0, limit: int = 48) -> dict:
        """Paginated browse of cached entities for the UI.

        Args:
            entity_type: Only rows of this type are listed.
            source: Optional source filter.
            search: Optional literal substring matched against name,
                artist_name and album_name ('%' and '_' are not wildcards).
            sort: Sort column; unknown values fall back to 'last_accessed_at'.
            sort_dir: 'asc' for ascending, anything else for descending.
            offset, limit: Pagination window.

        Returns:
            {'items': [...], 'total': int, 'offset': offset, 'limit': limit};
            on error an empty page with total 0.
        """
        try:
            conn = self._get_db()._get_connection()
            try:
                cursor = conn.cursor()
                where_clauses = [
                    'entity_type = ?',
                    # Exclude pseudo-entities like album_id_tracks and track_id_features
                    r"entity_id NOT LIKE '%\_tracks' ESCAPE '\'",
                    r"entity_id NOT LIKE '%\_features' ESCAPE '\'",
                ]
                params = [entity_type]
                if source:
                    where_clauses.append('source = ?')
                    params.append(source)
                if search:
                    # Escape LIKE metacharacters so user text matches literally.
                    escaped = (str(search)
                               .replace('\\', '\\\\')
                               .replace('%', '\\%')
                               .replace('_', '\\_'))
                    pattern = f'%{escaped}%'
                    where_clauses.append(
                        r"(name LIKE ? ESCAPE '\' OR artist_name LIKE ? ESCAPE '\'"
                        r" OR album_name LIKE ? ESCAPE '\')"
                    )
                    params.extend([pattern, pattern, pattern])
                where_sql = ' AND '.join(where_clauses)

                # Count total
                cursor.execute(
                    f"SELECT COUNT(*) as cnt FROM metadata_cache_entities WHERE {where_sql}", params)
                total = cursor.fetchone()['cnt']

                # Sort column and direction are interpolated into the SQL, so
                # both are restricted to known-safe values first.
                if sort not in self._BROWSE_SORT_COLUMNS:
                    sort = 'last_accessed_at'
                direction = 'ASC' if sort_dir == 'asc' else 'DESC'
                cursor.execute(f"""
                    SELECT id, source, entity_type, entity_id, name, image_url,
                           genres, popularity, followers,
                           artist_name, artist_id, release_date, total_tracks, album_type, label,
                           album_name, album_id, duration_ms, track_number, disc_number, explicit,
                           isrc, preview_url, external_urls,
                           created_at, updated_at, last_accessed_at, access_count
                    FROM metadata_cache_entities
                    WHERE {where_sql}
                    ORDER BY {sort} {direction}
                    LIMIT ? OFFSET ?
                """, params + [limit, offset])

                # Parse JSON fields for the UI
                items = [
                    self._decode_json_columns(dict(row), ('genres', 'external_urls'))
                    for row in cursor.fetchall()
                ]
                return {'items': items, 'total': total, 'offset': offset, 'limit': limit}
            finally:
                conn.close()
        except Exception as e:
            logger.error("Cache browse error: %s", e)
            return {'items': [], 'total': 0, 'offset': offset, 'limit': limit}

    def get_entity_detail(self, source: str, entity_type: str, entity_id: str) -> Optional[dict]:
        """Get every column of one entity, with JSON columns parsed, for the detail modal.

        Updates the row's access statistics. Returns None when the entity is
        not cached or an error occurs.
        """
        try:
            conn = self._get_db()._get_connection()
            try:
                cursor = conn.cursor()
                cursor.execute("""
                    SELECT * FROM metadata_cache_entities
                    WHERE source = ? AND entity_type = ? AND entity_id = ?
                """, (source, entity_type, entity_id))
                row = cursor.fetchone()
                if not row:
                    return None
                # Touch
                cursor.execute(self._TOUCH_ENTITY_SQL, (row['id'],))
                conn.commit()
                return self._decode_json_columns(dict(row), ('genres', 'external_urls', 'raw_json'))
            finally:
                conn.close()
        except Exception as e:
            logger.error("Cache detail error: %s", e)
            return None

    # ─── Stats ────────────────────────────────────────────────────────

    def get_stats(self) -> dict:
        """Get cache statistics for the dashboard tool card and modal stats bar.

        Returns per-type/per-source entity counts, the number of cached
        searches, total entries and hits, and the oldest/newest created_at.
        On error a zeroed structure of the same shape is returned.
        """
        try:
            conn = self._get_db()._get_connection()
            try:
                cursor = conn.cursor()
                stats = self._empty_stats()

                # Count by type and source (exclude pseudo-entities like _tracks, _features)
                cursor.execute(r"""
                    SELECT entity_type, source, COUNT(*) as cnt, SUM(access_count) as hits
                    FROM metadata_cache_entities
                    WHERE entity_id NOT LIKE '%\_tracks' ESCAPE '\'
                      AND entity_id NOT LIKE '%\_features' ESCAPE '\'
                    GROUP BY entity_type, source
                """)
                for row in cursor.fetchall():
                    type_key = self._STATS_TYPE_KEYS.get(row['entity_type'])
                    src = row['source']
                    # Rows of an unknown type or source are left out of every total.
                    if type_key and type_key in stats and src in stats[type_key]:
                        stats[type_key][src] = row['cnt']
                        stats['total_entries'] += row['cnt']
                        stats['total_hits'] += (row['hits'] or 0)

                # Search count
                cursor.execute("SELECT COUNT(*) as cnt FROM metadata_cache_searches")
                stats['searches'] = cursor.fetchone()['cnt']

                # Oldest and newest
                cursor.execute(
                    "SELECT MIN(created_at) as oldest, MAX(created_at) as newest "
                    "FROM metadata_cache_entities")
                row = cursor.fetchone()
                if row:
                    stats['oldest'] = row['oldest']
                    stats['newest'] = row['newest']
                return stats
            finally:
                conn.close()
        except Exception as e:
            logger.error("Cache stats error: %s", e)
            return self._empty_stats()

    # ─── Maintenance ──────────────────────────────────────────────────

    def evict_expired(self) -> int:
        """Delete entries that have exceeded their TTL. Returns count of evicted entries.

        Entity rows with a NULL ttl_days use the same default TTL as
        get_entity(). Returns 0 on error.
        """
        try:
            conn = self._get_db()._get_connection()
            try:
                cursor = conn.cursor()
                # Entities
                cursor.execute("""
                    DELETE FROM metadata_cache_entities
                    WHERE julianday('now') - julianday(updated_at) > COALESCE(ttl_days, ?)
                """, (self._DEFAULT_ENTITY_TTL_DAYS,))
                entity_count = cursor.rowcount
                # Searches
                cursor.execute("""
                    DELETE FROM metadata_cache_searches
                    WHERE julianday('now') - julianday(created_at) > ttl_days
                """)
                search_count = cursor.rowcount
                conn.commit()
                total = entity_count + search_count
                if total > 0:
                    logger.info("Evicted %s expired cache entries (%s entities, %s searches)",
                                total, entity_count, search_count)
                return total
            finally:
                conn.close()
        except Exception as e:
            logger.error("Cache eviction error: %s", e)
            return 0

    def clear(self, source: str = None, entity_type: str = None) -> int:
        """Clear cache entries, optionally filtered by source and/or entity_type.

        The same filters are applied to cached searches (entity_type is matched
        against search_type). Returns the number of deleted rows, 0 on error.
        """
        try:
            conn = self._get_db()._get_connection()
            try:
                cursor = conn.cursor()
                entity_count = self._delete_matching(
                    cursor, 'metadata_cache_entities',
                    (('source', source), ('entity_type', entity_type)))
                # Clear searches (match source and entity_type → search_type)
                search_count = self._delete_matching(
                    cursor, 'metadata_cache_searches',
                    (('source', source), ('search_type', entity_type)))
                conn.commit()
                total = entity_count + search_count
                logger.info("Cleared %s cache entries (source=%s, type=%s)", total, source, entity_type)
                return total
            finally:
                conn.close()
        except Exception as e:
            logger.error("Cache clear error: %s", e)
            return 0

    # ─── Field Extraction ─────────────────────────────────────────────

    def _extract_fields(self, source: str, entity_type: str, raw_data: dict) -> dict:
        """Extract structured queryable fields from a raw API response.

        Unknown sources yield only a best-effort 'name'.
        """
        if source == 'spotify':
            return self._extract_spotify_fields(entity_type, raw_data)
        if source == 'itunes':
            return self._extract_itunes_fields(entity_type, raw_data)
        return {'name': str(raw_data.get('name', raw_data.get('trackName', '')))}

    @staticmethod
    def _first_image_url(images) -> Optional[str]:
        """Return the 'url' of the first entry of an images list, or None."""
        if isinstance(images, (list, tuple)) and images and isinstance(images[0], dict):
            return images[0].get('url')
        return None

    @staticmethod
    def _primary_artist(data: dict) -> Optional[dict]:
        """Return the first entry of data['artists'] when it is a dict, else None."""
        artists = data.get('artists')
        if isinstance(artists, (list, tuple)) and artists and isinstance(artists[0], dict):
            return artists[0]
        return None

    def _extract_spotify_fields(self, entity_type: str, data: dict) -> dict:
        """Extract fields from a Spotify API response.

        Keys that are present but null (e.g. ``"album": null``) are handled
        like absent keys. Unknown entity types yield an empty dict.
        """
        fields = {}
        if entity_type == 'artist':
            fields['name'] = data.get('name', '')
            fields['genres'] = json.dumps(data.get('genres') or [])
            fields['popularity'] = data.get('popularity', 0)
            followers = data.get('followers')
            fields['followers'] = followers.get('total', 0) if isinstance(followers, dict) else 0
            fields['image_url'] = self._first_image_url(data.get('images'))
            fields['external_urls'] = json.dumps(data.get('external_urls') or {})
        elif entity_type == 'album':
            fields['name'] = data.get('name', '')
            artist = self._primary_artist(data)
            if artist is not None:
                fields['artist_name'] = artist.get('name', '')
                fields['artist_id'] = artist.get('id', '')
            fields['release_date'] = data.get('release_date', '')
            fields['total_tracks'] = data.get('total_tracks', 0)
            fields['album_type'] = data.get('album_type', 'album')
            fields['label'] = data.get('label', '')
            fields['image_url'] = self._first_image_url(data.get('images'))
            fields['genres'] = json.dumps(data.get('genres') or [])
            fields['external_urls'] = json.dumps(data.get('external_urls') or {})
        elif entity_type == 'track':
            fields['name'] = data.get('name', '')
            artist = self._primary_artist(data)
            if artist is not None:
                fields['artist_name'] = artist.get('name', '')
                fields['artist_id'] = artist.get('id', '')
            album = data.get('album')
            if not isinstance(album, dict):
                album = {}
            fields['album_name'] = album.get('name', '')
            fields['album_id'] = album.get('id', '')
            # Tracks carry no artwork of their own; use the album's.
            fields['image_url'] = self._first_image_url(album.get('images'))
            fields['duration_ms'] = data.get('duration_ms', 0)
            fields['track_number'] = data.get('track_number')
            fields['disc_number'] = data.get('disc_number', 1)
            fields['explicit'] = 1 if data.get('explicit') else 0
            fields['popularity'] = data.get('popularity', 0)
            ext_ids = data.get('external_ids', {})
            fields['isrc'] = ext_ids.get('isrc') if isinstance(ext_ids, dict) else None
            fields['preview_url'] = data.get('preview_url')
            fields['external_urls'] = json.dumps(data.get('external_urls') or {})
        return fields

    @staticmethod
    def _upscale_itunes_artwork(url):
        """Convert iTunes 100x100 artwork to 600x600."""
        if url and '100x100' in url:
            return url.replace('100x100', '600x600')
        return url

    @staticmethod
    def _itunes_external_urls(data: dict, view_url_key: str) -> str:
        """Return the JSON external_urls value: {'itunes': <view url>} or {}."""
        view_url = data.get(view_url_key)
        return json.dumps({'itunes': view_url} if view_url else {})

    def _extract_itunes_fields(self, entity_type: str, data: dict) -> dict:
        """Extract fields from an iTunes API response.

        Unknown entity types yield an empty dict.
        """
        fields = {}
        if entity_type == 'artist':
            fields['name'] = data.get('artistName', '')
            genre = data.get('primaryGenreName', '')
            fields['genres'] = json.dumps([genre] if genre else [])
            fields['popularity'] = 0
            fields['followers'] = 0
            fields['image_url'] = self._upscale_itunes_artwork(data.get('artworkUrl100'))
            fields['external_urls'] = self._itunes_external_urls(data, 'artistViewUrl')
        elif entity_type == 'album':
            fields['name'] = data.get('collectionName', '')
            fields['artist_name'] = data.get('artistName', '')
            fields['artist_id'] = str(data.get('artistId', ''))
            # Keep only the YYYY-MM-DD prefix of the release timestamp.
            fields['release_date'] = (data.get('releaseDate') or '')[:10]
            # A null trackCount counts as 0 instead of breaking the comparison below.
            track_count = data.get('trackCount') or 0
            fields['total_tracks'] = track_count
            # Infer album type from track count
            if track_count <= 3:
                fields['album_type'] = 'single'
            elif track_count <= 6:
                fields['album_type'] = 'ep'
            else:
                fields['album_type'] = 'album'
            fields['image_url'] = self._upscale_itunes_artwork(data.get('artworkUrl100'))
            fields['external_urls'] = self._itunes_external_urls(data, 'collectionViewUrl')
        elif entity_type == 'track':
            fields['name'] = data.get('trackName', '')
            fields['artist_name'] = data.get('artistName', '')
            fields['artist_id'] = str(data.get('artistId', ''))
            fields['album_name'] = data.get('collectionName', '')
            fields['album_id'] = str(data.get('collectionId', ''))
            fields['image_url'] = self._upscale_itunes_artwork(data.get('artworkUrl100'))
            fields['duration_ms'] = data.get('trackTimeMillis', 0)
            fields['track_number'] = data.get('trackNumber')
            fields['disc_number'] = data.get('discNumber', 1)
            fields['explicit'] = 1 if data.get('trackExplicitness') == 'explicit' else 0
            fields['preview_url'] = data.get('previewUrl')
            fields['external_urls'] = self._itunes_external_urls(data, 'trackViewUrl')
        return fields