mirror of https://github.com/Nezreka/SoulSync.git
You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1595 lines
74 KiB
1595 lines
74 KiB
"""
|
|
Universal Metadata Cache — caches all Spotify and iTunes API responses.
|
|
|
|
Stores full JSON responses alongside structured queryable fields for browsing.
|
|
Transparent to callers: check cache before API call, store after success.
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
import threading
|
|
from datetime import datetime
|
|
from typing import Optional, Dict, List, Tuple
|
|
|
|
logger = logging.getLogger(__name__)

# Singleton
_cache_instance = None          # lazily created MetadataCache; see get_metadata_cache()
_cache_lock = threading.Lock()  # guards double-checked creation of the singleton

# Flag so the one-shot Deezer genre backfill thread is launched at most once
_backfill_done = False
|
|
|
|
def get_metadata_cache():
    """Get or create the singleton MetadataCache instance.

    Uses double-checked locking so the common (already-created) path takes
    no lock.  On first creation, launches a one-time daemon thread that
    backfills Deezer album genres from the stored raw_json.
    """
    global _cache_instance, _backfill_done
    if _cache_instance is None:
        with _cache_lock:
            if _cache_instance is None:
                _cache_instance = MetadataCache()
                # One-time backfill of Deezer album genres from stored raw_json.
                if not _backfill_done:
                    _backfill_done = True
                    try:
                        # threading is imported at module level; the previous
                        # function-local re-import was redundant.  Daemon thread:
                        # best-effort work that must not block shutdown.
                        threading.Thread(
                            target=_cache_instance.backfill_deezer_album_genres,
                            daemon=True,
                        ).start()
                    except Exception:
                        # Backfill is best-effort; never fail singleton creation.
                        pass
    return _cache_instance
|
|
|
|
|
|
class MetadataCache:
|
|
"""Caches Spotify and iTunes API responses with structured fields + raw JSON."""
|
|
|
|
def __init__(self):
|
|
# Tables are created by MusicDatabase migration — we just use get_database()
|
|
pass
|
|
|
|
def _get_db(self):
    """Return the shared MusicDatabase instance.

    Imported lazily inside the method to avoid a circular import at
    module load time.
    """
    from database.music_database import get_database
    return get_database()
|
|
|
|
# ─── Entity Methods ───────────────────────────────────────────────
|
|
|
|
def get_entity(self, source: str, entity_type: str, entity_id: str) -> Optional[dict]:
    """Look up a cached entity. Returns parsed raw_json dict on hit, None on miss.

    A hit that has outlived its TTL is deleted immediately and reported as
    a miss.  Genuine hits are "touched" (last_accessed_at / access_count)
    so usage stats stay current.  Any database error is logged at debug
    level and treated as a miss — the cache is strictly best-effort.
    """
    try:
        db = self._get_db()
        conn = db._get_connection()
        try:
            cursor = conn.cursor()
            cursor.execute("""
                SELECT id, raw_json, updated_at, ttl_days FROM metadata_cache_entities
                WHERE source = ? AND entity_type = ? AND entity_id = ?
            """, (source, entity_type, entity_id))
            row = cursor.fetchone()
            if row:
                # Inline TTL check — don't serve stale data
                try:
                    updated = datetime.fromisoformat(row['updated_at'])
                    age_days = (datetime.now() - updated).days
                    # NULL/0 ttl_days falls back to a 30-day default
                    if age_days > (row['ttl_days'] or 30):
                        cursor.execute("DELETE FROM metadata_cache_entities WHERE id = ?", (row['id'],))
                        conn.commit()
                        return None
                except (ValueError, TypeError):
                    # Unparseable timestamp: skip the TTL check rather than
                    # fail the whole lookup.
                    pass

                # Touch: update access stats
                cursor.execute("""
                    UPDATE metadata_cache_entities
                    SET last_accessed_at = CURRENT_TIMESTAMP, access_count = access_count + 1
                    WHERE id = ?
                """, (row['id'],))
                conn.commit()
                return json.loads(row['raw_json'])
            return None
        finally:
            conn.close()
    except Exception as e:
        logger.debug(f"Cache lookup error ({source}/{entity_type}/{entity_id}): {e}")
        return None
|
|
|
|
# Names that indicate junk/placeholder data — should not be cached.
# Compared case-insensitively against the stripped 'name' field
# (see _is_junk_entity); '' catches blank/whitespace-only names.
_JUNK_NAMES = frozenset({
    '', 'unknown', 'unknown artist', 'unknown album', 'unknown track',
    'untitled', 'none', 'n/a', 'null',
})
|
|
|
|
def _is_junk_entity(self, fields: dict) -> bool:
|
|
"""Check if extracted fields represent junk/placeholder data."""
|
|
name = (fields.get('name') or '').strip().lower()
|
|
return name in self._JUNK_NAMES
|
|
|
|
def store_entity(self, source: str, entity_type: str, entity_id: str, raw_data: dict) -> None:
    """Store an entity in the cache. Extracts structured fields from raw_data.

    Junk/placeholder entries (see _JUNK_NAMES) are rejected, except for
    synthetic sub-entries whose ids end in '_features' or '_tracks'.
    INSERT OR REPLACE refreshes an existing row in place; the COALESCE
    subselect carries the previous access_count forward (+1) across the
    replace.  Errors are logged at debug level and swallowed.
    """
    if not entity_id or not raw_data:
        return
    try:
        fields = self._extract_fields(source, entity_type, raw_data)
        # Skip validation for synthetic cache entries (_features, _tracks suffixes)
        if not entity_id.endswith('_features') and not entity_id.endswith('_tracks') and self._is_junk_entity(fields):
            logger.debug(f"Rejecting junk entity ({source}/{entity_type}/{entity_id}): name='{fields.get('name')}'")
            return
        # default=str stringifies non-JSON-serializable values instead of raising
        raw_json = json.dumps(raw_data, default=str)
        db = self._get_db()
        conn = db._get_connection()
        try:
            cursor = conn.cursor()
            cursor.execute("""
                INSERT OR REPLACE INTO metadata_cache_entities
                (source, entity_type, entity_id, name, image_url, external_urls,
                 genres, popularity, followers,
                 artist_name, artist_id, release_date, total_tracks, album_type, label,
                 album_name, album_id, duration_ms, track_number, disc_number, explicit, isrc, preview_url,
                 raw_json, updated_at, last_accessed_at, access_count)
                VALUES (?, ?, ?, ?, ?, ?,
                        ?, ?, ?,
                        ?, ?, ?, ?, ?, ?,
                        ?, ?, ?, ?, ?, ?, ?, ?,
                        ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP,
                        COALESCE((SELECT access_count FROM metadata_cache_entities
                                  WHERE source = ? AND entity_type = ? AND entity_id = ?), 0) + 1)
            """, (
                source, entity_type, entity_id,
                fields.get('name', ''),
                fields.get('image_url'),
                fields.get('external_urls'),
                fields.get('genres'),
                fields.get('popularity'),
                fields.get('followers'),
                fields.get('artist_name'),
                fields.get('artist_id'),
                fields.get('release_date'),
                fields.get('total_tracks'),
                fields.get('album_type'),
                fields.get('label'),
                fields.get('album_name'),
                fields.get('album_id'),
                fields.get('duration_ms'),
                fields.get('track_number'),
                fields.get('disc_number'),
                fields.get('explicit'),
                fields.get('isrc'),
                fields.get('preview_url'),
                raw_json,
                # Parameters for the COALESCE subselect above
                source, entity_type, entity_id,
            ))
            conn.commit()
        finally:
            conn.close()
    except Exception as e:
        logger.debug(f"Cache store error ({source}/{entity_type}/{entity_id}): {e}")
|
|
|
|
def store_entities_bulk(self, source: str, entity_type: str, items: List[Tuple[str, dict]],
                        skip_if_exists: bool = False) -> None:
    """Store multiple entities at once. items = [(entity_id, raw_data), ...]

    Args:
        skip_if_exists: If True, don't overwrite existing entries. Use this for
            opportunistic caching of simplified data (e.g. from list endpoints)
            to avoid replacing richer data from detail endpoints.

    All rows share one connection and a single commit at the end.
    NOTE(review): unlike store_entity, the junk-name check here has no
    '_features'/'_tracks' exemption — confirm that is intentional.
    Errors are logged at debug level and swallowed.
    """
    if not items:
        return
    try:
        db = self._get_db()
        conn = db._get_connection()
        try:
            cursor = conn.cursor()
            for entity_id, raw_data in items:
                if not entity_id or not raw_data:
                    continue

                if skip_if_exists:
                    # Existence probe — keeps richer detail-endpoint rows intact
                    cursor.execute("""
                        SELECT 1 FROM metadata_cache_entities
                        WHERE source = ? AND entity_type = ? AND entity_id = ?
                    """, (source, entity_type, entity_id))
                    if cursor.fetchone():
                        continue

                fields = self._extract_fields(source, entity_type, raw_data)
                if self._is_junk_entity(fields):
                    continue
                raw_json = json.dumps(raw_data, default=str)
                cursor.execute("""
                    INSERT OR REPLACE INTO metadata_cache_entities
                    (source, entity_type, entity_id, name, image_url, external_urls,
                     genres, popularity, followers,
                     artist_name, artist_id, release_date, total_tracks, album_type, label,
                     album_name, album_id, duration_ms, track_number, disc_number, explicit, isrc, preview_url,
                     raw_json, updated_at, last_accessed_at, access_count)
                    VALUES (?, ?, ?, ?, ?, ?,
                            ?, ?, ?,
                            ?, ?, ?, ?, ?, ?,
                            ?, ?, ?, ?, ?, ?, ?, ?,
                            ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP,
                            COALESCE((SELECT access_count FROM metadata_cache_entities
                                      WHERE source = ? AND entity_type = ? AND entity_id = ?), 0) + 1)
                """, (
                    source, entity_type, entity_id,
                    fields.get('name', ''),
                    fields.get('image_url'),
                    fields.get('external_urls'),
                    fields.get('genres'),
                    fields.get('popularity'),
                    fields.get('followers'),
                    fields.get('artist_name'),
                    fields.get('artist_id'),
                    fields.get('release_date'),
                    fields.get('total_tracks'),
                    fields.get('album_type'),
                    fields.get('label'),
                    fields.get('album_name'),
                    fields.get('album_id'),
                    fields.get('duration_ms'),
                    fields.get('track_number'),
                    fields.get('disc_number'),
                    fields.get('explicit'),
                    fields.get('isrc'),
                    fields.get('preview_url'),
                    raw_json,
                    # Parameters for the COALESCE subselect above
                    source, entity_type, entity_id,
                ))
            conn.commit()
        finally:
            conn.close()
    except Exception as e:
        logger.debug(f"Cache bulk store error ({source}/{entity_type}): {e}")
|
|
|
|
def get_entities_batch(self, source: str, entity_type: str,
                       entity_ids: List[str]) -> Tuple[Dict[str, dict], List[str]]:
    """Batch cache lookup. Returns (found_dict, missing_ids).

    Queries in chunks of 500 ids to stay under SQLite's bound-variable
    limit, and "touches" every hit (last_accessed_at / access_count).
    NOTE(review): unlike get_entity, there is no per-row TTL check here —
    expired rows are served until evict_expired() removes them.
    On any error the whole id list is reported as missing.
    """
    found = {}
    missing = []
    if not entity_ids:
        return found, missing
    try:
        db = self._get_db()
        conn = db._get_connection()
        try:
            cursor = conn.cursor()
            # Batch query in chunks of 500 to avoid SQLite variable limit
            for i in range(0, len(entity_ids), 500):
                chunk = entity_ids[i:i + 500]
                placeholders = ','.join('?' * len(chunk))
                cursor.execute(f"""
                    SELECT entity_id, raw_json FROM metadata_cache_entities
                    WHERE source = ? AND entity_type = ? AND entity_id IN ({placeholders})
                """, [source, entity_type] + chunk)
                for row in cursor.fetchall():
                    found[row['entity_id']] = json.loads(row['raw_json'])
                # Touch all found entries (restricted to this chunk's ids)
                if found:
                    found_in_chunk = [eid for eid in chunk if eid in found]
                    if found_in_chunk:
                        ph2 = ','.join('?' * len(found_in_chunk))
                        cursor.execute(f"""
                            UPDATE metadata_cache_entities
                            SET last_accessed_at = CURRENT_TIMESTAMP, access_count = access_count + 1
                            WHERE source = ? AND entity_type = ? AND entity_id IN ({ph2})
                        """, [source, entity_type] + found_in_chunk)
            conn.commit()
        finally:
            conn.close()
        missing = [eid for eid in entity_ids if eid not in found]
    except Exception as e:
        logger.debug(f"Cache batch lookup error: {e}")
        missing = entity_ids
    return found, missing
|
|
|
|
# ─── Search Cache Methods ─────────────────────────────────────────
|
|
|
|
def get_search_results(self, source: str, search_type: str,
                       query: str, limit: int) -> Optional[List[dict]]:
    """Look up cached search results. Returns list of raw_json dicts or None.

    Searches store only entity ids; each id is re-resolved against
    metadata_cache_entities here.  A hit older than 7 days is deleted
    and treated as a miss.  If fewer than 80% of the referenced entities
    still exist, the result is considered unreliable and None is returned
    (the stale mapping itself is left for clean_orphaned_searches).
    """
    # Queries are matched on their normalized (stripped, lowercased) form.
    normalized = query.strip().lower()
    if not normalized:
        return None
    try:
        db = self._get_db()
        conn = db._get_connection()
        try:
            cursor = conn.cursor()
            cursor.execute("""
                SELECT id, result_ids, created_at FROM metadata_cache_searches
                WHERE source = ? AND search_type = ? AND query_normalized = ? AND search_limit = ?
            """, (source, search_type, normalized, limit))
            row = cursor.fetchone()
            if not row:
                return None

            # Check TTL (7 days for searches)
            try:
                created = datetime.fromisoformat(row['created_at'])
                age_days = (datetime.now() - created).days
                if age_days > 7:
                    # Expired — delete and return miss
                    cursor.execute("DELETE FROM metadata_cache_searches WHERE id = ?", (row['id'],))
                    conn.commit()
                    return None
            except (ValueError, TypeError):
                # Unparseable timestamp — skip TTL check rather than fail
                pass

            # Touch search entry
            cursor.execute("""
                UPDATE metadata_cache_searches
                SET last_accessed_at = CURRENT_TIMESTAMP, access_count = access_count + 1
                WHERE id = ?
            """, (row['id'],))
            conn.commit()

            # Resolve entity IDs to full data
            result_ids = json.loads(row['result_ids'])
            if not result_ids:
                return []

            results = []
            for eid in result_ids:
                cursor.execute("""
                    SELECT raw_json FROM metadata_cache_entities
                    WHERE source = ? AND entity_type = ? AND entity_id = ?
                """, (source, search_type, eid))
                erow = cursor.fetchone()
                if erow:
                    results.append(json.loads(erow['raw_json']))

            # Only return if we found all (or most) entries — partial results are unreliable
            if len(results) >= len(result_ids) * 0.8:
                return results
            return None
        finally:
            conn.close()
    except Exception as e:
        logger.debug(f"Search cache lookup error ({source}/{search_type}/{query}): {e}")
        return None
|
|
|
|
def store_search_results(self, source: str, search_type: str, query: str,
                         limit: int, entity_ids: List[str]) -> None:
    """Persist the query → result-id mapping for a search.

    The referenced entities are assumed to already be cached via
    store_entity / store_entities_bulk; only the id list is stored here.
    Blank queries and empty result lists are ignored.  Errors are logged
    at debug level and swallowed.
    """
    normalized = query.strip().lower()
    if not normalized or not entity_ids:
        return
    try:
        connection = self._get_db()._get_connection()
        try:
            cur = connection.cursor()
            row_values = (
                source,
                search_type,
                normalized,
                query.strip(),
                json.dumps(entity_ids),
                len(entity_ids),
                limit,
            )
            cur.execute("""
                INSERT OR REPLACE INTO metadata_cache_searches
                (source, search_type, query_normalized, query_original, result_ids,
                 result_count, search_limit, last_accessed_at)
                VALUES (?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP)
            """, row_values)
            connection.commit()
        finally:
            connection.close()
    except Exception as e:
        logger.debug(f"Search cache store error ({source}/{search_type}/{query}): {e}")
|
|
|
|
# ─── Browsing (for UI) ────────────────────────────────────────────
|
|
|
|
def browse(self, entity_type: str, source: str = None, search: str = None,
           sort: str = 'last_accessed_at', sort_dir: str = 'desc',
           offset: int = 0, limit: int = 48) -> dict:
    """Paginated browse of cached entities for the UI.

    Args:
        entity_type: required type filter (e.g. 'artist', 'album', 'track').
        source: optional source filter (e.g. 'spotify', 'itunes', 'deezer').
        search: optional substring match on name/artist_name/album_name.
        sort: column to order by; anything outside the whitelist falls
            back to 'last_accessed_at' (prevents ORDER BY injection).
        sort_dir: 'asc' for ascending, anything else means DESC.
        offset, limit: pagination window.

    Returns:
        {'items': [...], 'total': int, 'offset': int, 'limit': int};
        an empty result structure on any database error.
    """
    try:
        db = self._get_db()
        conn = db._get_connection()
        try:
            cursor = conn.cursor()

            where_clauses = ['entity_type = ?']
            params = [entity_type]

            if source:
                where_clauses.append('source = ?')
                params.append(source)

            if search:
                search_term = f'%{search}%'
                where_clauses.append('(name LIKE ? OR artist_name LIKE ? OR album_name LIKE ?)')
                params.extend([search_term, search_term, search_term])

            where_sql = ' AND '.join(where_clauses)

            # Count total
            cursor.execute(f"SELECT COUNT(*) as cnt FROM metadata_cache_entities WHERE {where_sql}", params)
            total = cursor.fetchone()['cnt']

            # Validate sort column — only whitelisted names reach the SQL below
            valid_sorts = {'last_accessed_at', 'created_at', 'access_count', 'name', 'popularity', 'updated_at'}
            if sort not in valid_sorts:
                sort = 'last_accessed_at'
            direction = 'ASC' if sort_dir == 'asc' else 'DESC'

            # Don't select raw_json — it's huge and only needed for detail view
            cursor.execute(f"""
                SELECT id, source, entity_type, entity_id, name, image_url,
                       genres, popularity, followers,
                       artist_name, artist_id, release_date, total_tracks, album_type, label,
                       album_name, album_id, duration_ms, track_number, disc_number, explicit,
                       isrc, preview_url, external_urls,
                       created_at, updated_at, last_accessed_at, access_count
                FROM metadata_cache_entities
                WHERE {where_sql}
                ORDER BY {sort} {direction}
                LIMIT ? OFFSET ?
            """, params + [limit, offset])

            items = []
            for row in cursor.fetchall():
                item = dict(row)
                # Parse JSON fields for the UI
                for json_field in ('genres', 'external_urls'):
                    if item.get(json_field):
                        try:
                            item[json_field] = json.loads(item[json_field])
                        except (json.JSONDecodeError, TypeError):
                            # Leave the raw string in place if it isn't valid JSON
                            pass
                items.append(item)

            return {'items': items, 'total': total, 'offset': offset, 'limit': limit}
        finally:
            conn.close()
    except Exception as e:
        logger.error(f"Cache browse error: {e}")
        return {'items': [], 'total': 0, 'offset': offset, 'limit': limit}
|
|
|
|
def get_entity_detail(self, source: str, entity_type: str, entity_id: str) -> Optional[dict]:
    """Get full entity detail including parsed raw_json for the detail modal.

    Unlike get_entity, this returns every column of the row (not just the
    raw payload) and performs no TTL check; it still "touches" the row's
    access stats.  Returns None on miss or on any database error.
    """
    try:
        db = self._get_db()
        conn = db._get_connection()
        try:
            cursor = conn.cursor()
            cursor.execute("""
                SELECT * FROM metadata_cache_entities
                WHERE source = ? AND entity_type = ? AND entity_id = ?
            """, (source, entity_type, entity_id))
            row = cursor.fetchone()
            if not row:
                return None

            # Touch
            cursor.execute("""
                UPDATE metadata_cache_entities
                SET last_accessed_at = CURRENT_TIMESTAMP, access_count = access_count + 1
                WHERE id = ?
            """, (row['id'],))
            conn.commit()

            item = dict(row)
            # Parse JSON fields
            for json_field in ('genres', 'external_urls', 'raw_json'):
                if item.get(json_field):
                    try:
                        item[json_field] = json.loads(item[json_field])
                    except (json.JSONDecodeError, TypeError):
                        # Leave the raw string in place if it isn't valid JSON
                        pass
            return item
        finally:
            conn.close()
    except Exception as e:
        logger.error(f"Cache detail error: {e}")
        return None
|
|
|
|
# ─── Stats ────────────────────────────────────────────────────────
|
|
|
|
def get_stats(self) -> dict:
    """Get cache statistics for the dashboard tool card and modal stats bar.

    Returns per-source counts for artists/albums/tracks, the search count,
    total entries/hits, oldest/newest created_at timestamps, and
    MusicBrainz cache totals.  On error a zeroed structure of the same
    shape (minus the MusicBrainz keys) is returned.
    """
    try:
        db = self._get_db()
        conn = db._get_connection()
        try:
            cursor = conn.cursor()

            stats = {
                'artists': {'spotify': 0, 'itunes': 0, 'deezer': 0},
                'albums': {'spotify': 0, 'itunes': 0, 'deezer': 0},
                'tracks': {'spotify': 0, 'itunes': 0, 'deezer': 0},
                'searches': 0,
                'total_entries': 0,
                'total_hits': 0,
                'oldest': None,
                'newest': None,
            }

            # Count by type and source — fast via idx_mce_stats covering index
            cursor.execute("""
                SELECT entity_type, source, COUNT(*) as cnt, SUM(access_count) as hits
                FROM metadata_cache_entities
                GROUP BY entity_type, source
            """)
            type_key_map = {'artist': 'artists', 'album': 'albums', 'track': 'tracks'}
            for row in cursor.fetchall():
                et = type_key_map.get(row['entity_type'])
                src = row['source']
                # Only the three known types/sources get a per-bucket count
                if et and et in stats and src in stats[et]:
                    stats[et][src] = row['cnt']
                stats['total_entries'] += row['cnt']
                stats['total_hits'] += (row['hits'] or 0)

            # Search count
            cursor.execute("SELECT COUNT(*) as cnt FROM metadata_cache_searches")
            stats['searches'] = cursor.fetchone()['cnt']

            # Oldest and newest
            cursor.execute("SELECT MIN(created_at) as oldest, MAX(created_at) as newest FROM metadata_cache_entities")
            row = cursor.fetchone()
            if row:
                stats['oldest'] = row['oldest']
                stats['newest'] = row['newest']

            # MusicBrainz cache stats — table may be absent; default to zeros
            try:
                cursor.execute("SELECT COUNT(*) as cnt FROM musicbrainz_cache")
                stats['musicbrainz_total'] = cursor.fetchone()['cnt']
                cursor.execute("SELECT COUNT(*) as cnt FROM musicbrainz_cache WHERE musicbrainz_id IS NULL")
                stats['musicbrainz_failed'] = cursor.fetchone()['cnt']
            except Exception:
                stats['musicbrainz_total'] = 0
                stats['musicbrainz_failed'] = 0

            return stats
        finally:
            conn.close()
    except Exception as e:
        logger.error(f"Cache stats error: {e}")
        return {
            'artists': {'spotify': 0, 'itunes': 0, 'deezer': 0},
            'albums': {'spotify': 0, 'itunes': 0, 'deezer': 0},
            'tracks': {'spotify': 0, 'itunes': 0, 'deezer': 0},
            'searches': 0, 'total_entries': 0, 'total_hits': 0,
            'oldest': None, 'newest': None,
        }
|
|
|
|
# ─── Maintenance ──────────────────────────────────────────────────
|
|
|
|
def evict_expired(self) -> int:
    """Delete entries that have exceeded their TTL. Returns count of evicted entries.

    Ages are computed in SQL via julian-day arithmetic so SQLite performs
    the whole delete without round-tripping rows through Python.  Entities
    expire relative to updated_at, searches relative to created_at.
    """
    try:
        conn = self._get_db()._get_connection()
        try:
            cur = conn.cursor()

            # Entities
            cur.execute("""
                DELETE FROM metadata_cache_entities
                WHERE julianday('now') - julianday(updated_at) > ttl_days
            """)
            n_entities = cur.rowcount

            # Searches
            cur.execute("""
                DELETE FROM metadata_cache_searches
                WHERE julianday('now') - julianday(created_at) > ttl_days
            """)
            n_searches = cur.rowcount

            conn.commit()
            evicted = n_entities + n_searches
            if evicted > 0:
                logger.info(f"Evicted {evicted} expired cache entries ({n_entities} entities, {n_searches} searches)")
            return evicted
        finally:
            conn.close()
    except Exception as e:
        logger.error(f"Cache eviction error: {e}")
        return 0
|
|
|
|
def clean_junk_entities(self) -> int:
    """Delete cached entities with empty/placeholder names.

    Synthetic sub-entries (ids ending in '_features'/'_tracks') are kept,
    mirroring the exemption in store_entity.  Returns rows deleted, 0 on error.
    """
    try:
        db = self._get_db()
        conn = db._get_connection()
        try:
            cursor = conn.cursor()
            # Bind the junk names as parameters instead of splicing them into
            # the SQL string — immune to quoting bugs if _JUNK_NAMES ever
            # gains a name containing a quote.  '' is excluded because the
            # TRIM(name) = '' clause already covers blank names.
            junk_names = sorted(self._JUNK_NAMES - {''})
            placeholders = ','.join('?' * len(junk_names))
            cursor.execute(f"""
                DELETE FROM metadata_cache_entities
                WHERE (name IS NULL
                       OR TRIM(name) = ''
                       OR LOWER(TRIM(name)) IN ({placeholders}))
                  AND entity_id NOT LIKE '%\\_features' ESCAPE '\\'
                  AND entity_id NOT LIKE '%\\_tracks' ESCAPE '\\'
            """, junk_names)
            count = cursor.rowcount
            conn.commit()
            if count > 0:
                logger.info(f"Cleaned {count} junk entities from cache")
            return count
        finally:
            conn.close()
    except Exception as e:
        logger.error(f"Junk cleanup error: {e}")
        return 0
|
|
|
|
def clean_orphaned_searches(self) -> int:
    """Delete search results where <50% of referenced entities still exist.

    Also removes rows whose result_ids are unparseable or empty.  Deletes
    are batched 400 ids at a time to stay under SQLite's bound-variable
    limit.  Returns rows deleted, 0 on error.
    """
    try:
        db = self._get_db()
        conn = db._get_connection()
        try:
            cursor = conn.cursor()
            cursor.execute("SELECT id, source, search_type, result_ids FROM metadata_cache_searches")
            rows = cursor.fetchall()

            dead_ids = []
            for row in rows:
                try:
                    result_ids = json.loads(row['result_ids'] or '[]')
                except (json.JSONDecodeError, TypeError):
                    # Corrupt payload — the mapping is useless, drop it
                    dead_ids.append(row['id'])
                    continue

                if not result_ids:
                    dead_ids.append(row['id'])
                    continue

                # Check how many referenced entities still exist
                # NOTE(review): a very long result_ids list (>~998 ids) would
                # exceed SQLite's default variable limit here — confirm search
                # limits keep this small.
                placeholders = ','.join('?' * len(result_ids))
                cursor.execute(f"""
                    SELECT COUNT(*) FROM metadata_cache_entities
                    WHERE source = ? AND entity_type = ? AND entity_id IN ({placeholders})
                """, [row['source'], row['search_type']] + list(result_ids))
                found = cursor.fetchone()[0]

                if found < len(result_ids) * 0.5:
                    dead_ids.append(row['id'])

            if dead_ids:
                # Delete in chunks to stay under SQLite variable limit
                for i in range(0, len(dead_ids), 400):
                    chunk = dead_ids[i:i + 400]
                    placeholders = ','.join('?' * len(chunk))
                    cursor.execute(f"DELETE FROM metadata_cache_searches WHERE id IN ({placeholders})", chunk)
                conn.commit()

            count = len(dead_ids)
            if count > 0:
                logger.info(f"Cleaned {count} orphaned search results from cache")
            return count
        finally:
            conn.close()
    except Exception as e:
        logger.error(f"Orphan search cleanup error: {e}")
        return 0
|
|
|
|
def clean_stale_musicbrainz_nulls(self, max_age_days: int = 30) -> int:
    """Delete MusicBrainz cache entries where lookup found nothing (null MBID) and age > max_age_days.

    Dropping old negative results lets a later lookup retry the entity.
    Returns rows deleted, 0 on error.
    """
    try:
        conn = self._get_db()._get_connection()
        try:
            cur = conn.cursor()
            cur.execute("""
                DELETE FROM musicbrainz_cache
                WHERE musicbrainz_id IS NULL
                AND julianday('now') - julianday(last_updated) > ?
            """, (max_age_days,))
            deleted = cur.rowcount
            conn.commit()
            if deleted > 0:
                logger.info(f"Cleaned {deleted} stale MusicBrainz null entries (>{max_age_days} days)")
            return deleted
        finally:
            conn.close()
    except Exception as e:
        logger.error(f"MusicBrainz null cleanup error: {e}")
        return 0
|
|
|
|
def get_health_stats(self) -> dict:
    """Return cache health statistics for the repair dashboard.

    Includes entity/search totals, junk-entity count, per-source and
    per-type breakdowns, expiry projections, average row age, MusicBrainz
    cache totals, and cumulative access hits.  Returns {} on error.
    """
    try:
        db = self._get_db()
        conn = db._get_connection()
        try:
            cursor = conn.cursor()
            stats = {}

            # Total counts
            cursor.execute("SELECT COUNT(*) FROM metadata_cache_entities")
            stats['total_entities'] = cursor.fetchone()[0]

            cursor.execute("SELECT COUNT(*) FROM metadata_cache_searches")
            stats['total_searches'] = cursor.fetchone()[0]

            # Junk entity count — _JUNK_NAMES is a module constant of plain
            # words, so interpolating it here is safe (no user input).
            junk_names = "', '".join(self._JUNK_NAMES - {''})
            cursor.execute(f"""
                SELECT COUNT(*) FROM metadata_cache_entities
                WHERE (name IS NULL OR TRIM(name) = '' OR LOWER(TRIM(name)) IN ('{junk_names}'))
                AND entity_id NOT LIKE '%\\_features' ESCAPE '\\'
                AND entity_id NOT LIKE '%\\_tracks' ESCAPE '\\'
            """)
            stats['junk_entities'] = cursor.fetchone()[0]

            # By source
            cursor.execute("""
                SELECT source, COUNT(*) FROM metadata_cache_entities GROUP BY source
            """)
            stats['by_source'] = {row[0]: row[1] for row in cursor.fetchall()}

            # By entity type
            cursor.execute("""
                SELECT entity_type, COUNT(*) FROM metadata_cache_entities GROUP BY entity_type
            """)
            stats['by_type'] = {row[0]: row[1] for row in cursor.fetchall()}

            # Expiring soon — within 1 day of each row's TTL
            cursor.execute("""
                SELECT COUNT(*) FROM metadata_cache_entities
                WHERE julianday('now') - julianday(updated_at) > ttl_days - 1
            """)
            stats['expiring_24h'] = cursor.fetchone()[0]

            # Within 7 days of each row's TTL
            cursor.execute("""
                SELECT COUNT(*) FROM metadata_cache_entities
                WHERE julianday('now') - julianday(updated_at) > ttl_days - 7
            """)
            stats['expiring_7d'] = cursor.fetchone()[0]

            # Average age
            cursor.execute("""
                SELECT AVG(julianday('now') - julianday(updated_at)) FROM metadata_cache_entities
            """)
            avg = cursor.fetchone()[0]
            stats['avg_age_days'] = round(avg, 1) if avg else 0

            # MusicBrainz stats — table may be absent; default to zeros
            try:
                cursor.execute("SELECT COUNT(*) FROM musicbrainz_cache")
                stats['total_musicbrainz'] = cursor.fetchone()[0]
                cursor.execute("SELECT COUNT(*) FROM musicbrainz_cache WHERE musicbrainz_id IS NULL")
                stats['stale_mb_nulls'] = cursor.fetchone()[0]
            except Exception:
                stats['total_musicbrainz'] = 0
                stats['stale_mb_nulls'] = 0

            # Total access hits (SUM is NULL on an empty table)
            cursor.execute("SELECT SUM(access_count) FROM metadata_cache_entities")
            hits = cursor.fetchone()[0]
            stats['total_access_hits'] = hits or 0

            return stats
        finally:
            conn.close()
    except Exception as e:
        logger.error(f"Cache health stats error: {e}")
        return {}
|
|
|
|
def clear(self, source: str = None, entity_type: str = None) -> int:
    """Clear cache entries. Optional filters by source and/or entity_type.

    Deletes from both the entity table and the search table (where
    entity_type is matched against search_type).  With no filters, both
    tables are emptied.  Returns total rows deleted, 0 on error.
    """
    try:
        db = self._get_db()
        conn = db._get_connection()
        try:
            cursor = conn.cursor()

            # Clear entities
            where_parts = []
            params = []
            if source:
                where_parts.append('source = ?')
                params.append(source)
            if entity_type:
                where_parts.append('entity_type = ?')
                params.append(entity_type)

            if where_parts:
                where_sql = ' AND '.join(where_parts)
                cursor.execute(f"DELETE FROM metadata_cache_entities WHERE {where_sql}", params)
            else:
                cursor.execute("DELETE FROM metadata_cache_entities")
            entity_count = cursor.rowcount

            # Clear searches (match source and entity_type → search_type)
            search_where = []
            search_params = []
            if source:
                search_where.append('source = ?')
                search_params.append(source)
            if entity_type:
                search_where.append('search_type = ?')
                search_params.append(entity_type)

            if search_where:
                cursor.execute(f"DELETE FROM metadata_cache_searches WHERE {' AND '.join(search_where)}", search_params)
            else:
                cursor.execute("DELETE FROM metadata_cache_searches")
            search_count = cursor.rowcount

            conn.commit()
            total = entity_count + search_count
            logger.info(f"Cleared {total} cache entries (source={source}, type={entity_type})")
            return total
        finally:
            conn.close()
    except Exception as e:
        logger.error(f"Cache clear error: {e}")
        return 0
|
|
|
|
def clear_musicbrainz(self, failed_only: bool = False) -> int:
    """Clear MusicBrainz cache entries. If failed_only=True, only clears entries with NULL musicbrainz_id.

    Returns rows deleted, 0 on error.
    """
    try:
        conn = self._get_db()._get_connection()
        try:
            cur = conn.cursor()
            statement = (
                "DELETE FROM musicbrainz_cache WHERE musicbrainz_id IS NULL"
                if failed_only
                else "DELETE FROM musicbrainz_cache"
            )
            cur.execute(statement)
            removed = cur.rowcount
            conn.commit()
            logger.info(f"Cleared {removed} MusicBrainz cache entries (failed_only={failed_only})")
            return removed
        finally:
            conn.close()
    except Exception as e:
        logger.error(f"MusicBrainz cache clear error: {e}")
        return 0
|
|
|
|
# ─── Field Extraction ─────────────────────────────────────────────
|
|
|
|
def _extract_fields(self, source: str, entity_type: str, raw_data: dict) -> dict:
|
|
"""Extract structured queryable fields from a raw API response."""
|
|
if source == 'spotify':
|
|
return self._extract_spotify_fields(entity_type, raw_data)
|
|
elif source == 'itunes':
|
|
return self._extract_itunes_fields(entity_type, raw_data)
|
|
elif source == 'deezer':
|
|
return self._extract_deezer_fields(entity_type, raw_data)
|
|
elif source == 'beatport':
|
|
return self._extract_beatport_fields(entity_type, raw_data)
|
|
return {'name': str(raw_data.get('name', raw_data.get('trackName', '')))}
|
|
|
|
def _extract_spotify_fields(self, entity_type: str, data: dict) -> dict:
|
|
"""Extract fields from Spotify API response."""
|
|
fields = {}
|
|
|
|
if entity_type == 'artist':
|
|
fields['name'] = data.get('name', '')
|
|
fields['genres'] = json.dumps(data.get('genres', []))
|
|
fields['popularity'] = data.get('popularity', 0)
|
|
followers = data.get('followers')
|
|
fields['followers'] = followers.get('total', 0) if isinstance(followers, dict) else 0
|
|
images = data.get('images', [])
|
|
fields['image_url'] = images[0]['url'] if images else None
|
|
fields['external_urls'] = json.dumps(data.get('external_urls', {}))
|
|
|
|
elif entity_type == 'album':
|
|
fields['name'] = data.get('name', '')
|
|
artists = data.get('artists', [])
|
|
if artists:
|
|
fields['artist_name'] = artists[0].get('name', '')
|
|
fields['artist_id'] = artists[0].get('id', '')
|
|
fields['release_date'] = data.get('release_date', '')
|
|
fields['total_tracks'] = data.get('total_tracks', 0)
|
|
fields['album_type'] = data.get('album_type', 'album')
|
|
fields['label'] = data.get('label', '')
|
|
images = data.get('images', [])
|
|
fields['image_url'] = images[0]['url'] if images else None
|
|
fields['genres'] = json.dumps(data.get('genres', []))
|
|
fields['external_urls'] = json.dumps(data.get('external_urls', {}))
|
|
|
|
elif entity_type == 'track':
|
|
fields['name'] = data.get('name', '')
|
|
artists = data.get('artists', [])
|
|
if artists:
|
|
fields['artist_name'] = artists[0].get('name', '')
|
|
fields['artist_id'] = artists[0].get('id', '')
|
|
album = data.get('album', {})
|
|
fields['album_name'] = album.get('name', '')
|
|
fields['album_id'] = album.get('id', '')
|
|
album_images = album.get('images', [])
|
|
fields['image_url'] = album_images[0]['url'] if album_images else None
|
|
fields['duration_ms'] = data.get('duration_ms', 0)
|
|
fields['track_number'] = data.get('track_number')
|
|
fields['disc_number'] = data.get('disc_number', 1)
|
|
fields['explicit'] = 1 if data.get('explicit') else 0
|
|
fields['popularity'] = data.get('popularity', 0)
|
|
ext_ids = data.get('external_ids', {})
|
|
fields['isrc'] = ext_ids.get('isrc') if isinstance(ext_ids, dict) else None
|
|
fields['preview_url'] = data.get('preview_url')
|
|
fields['external_urls'] = json.dumps(data.get('external_urls', {}))
|
|
|
|
return fields
|
|
|
|
# Deezer genre_id → name mapping (from https://api.deezer.com/genre).
# Deezer search/list payloads carry only a numeric genre_id; this table
# lets the extractor store a human-readable genre name instead.
_DEEZER_GENRE_MAP = {
    132: 'Pop', 116: 'Rap/Hip Hop', 122: 'Reggaeton', 152: 'Rock', 113: 'Dance',
    165: 'R&B', 85: 'Alternative', 186: 'Christian', 106: 'Electro', 466: 'Folk',
    144: 'Reggae', 129: 'Jazz', 84: 'Country', 67: 'Salsa', 173: 'Films/Games',
    98: 'Classical', 169: 'Soul & Funk', 2: 'African Music', 16: 'Asian Music',
    153: 'Blues', 75: 'Brazilian Music', 81: 'Indian Music', 95: 'Kids',
    197: 'Latin Music', 73: 'Metal', 464: 'Rap', 174: 'Musicals',
}
|
|
|
|
def _extract_deezer_fields(self, entity_type: str, data: dict) -> dict:
|
|
"""Extract fields from Deezer API response."""
|
|
fields = {}
|
|
|
|
if entity_type == 'artist':
|
|
fields['name'] = data.get('name', '')
|
|
# Deezer artists don't have genres directly, but may have genre_id from search context
|
|
genre_id = data.get('genre_id')
|
|
if genre_id and genre_id in self._DEEZER_GENRE_MAP:
|
|
fields['genres'] = json.dumps([self._DEEZER_GENRE_MAP[genre_id]])
|
|
else:
|
|
fields['genres'] = '[]'
|
|
fields['popularity'] = 0
|
|
fields['followers'] = data.get('nb_fan', 0)
|
|
fields['image_url'] = data.get('picture_xl') or data.get('picture_big') or data.get('picture_medium')
|
|
urls = {}
|
|
if data.get('link'):
|
|
urls['deezer'] = data['link']
|
|
fields['external_urls'] = json.dumps(urls)
|
|
|
|
elif entity_type == 'album':
|
|
fields['name'] = data.get('title', '')
|
|
artist = data.get('artist', {})
|
|
fields['artist_name'] = artist.get('name', '') if isinstance(artist, dict) else ''
|
|
fields['artist_id'] = str(artist.get('id', '')) if isinstance(artist, dict) else ''
|
|
fields['release_date'] = data.get('release_date', '')
|
|
fields['total_tracks'] = data.get('nb_tracks', 0)
|
|
record_type = data.get('record_type', 'album')
|
|
fields['album_type'] = record_type if record_type in ('single', 'ep', 'album') else 'album'
|
|
fields['label'] = data.get('label', '')
|
|
fields['image_url'] = data.get('cover_xl') or data.get('cover_big') or data.get('cover_medium')
|
|
# Deezer full album response: genres in data.genres.data[].name
|
|
# Deezer search response: genre_id (numeric) — map to name
|
|
dz_genres = data.get('genres', {})
|
|
if isinstance(dz_genres, dict):
|
|
dz_genres = dz_genres.get('data', [])
|
|
if isinstance(dz_genres, list) and dz_genres:
|
|
fields['genres'] = json.dumps([g.get('name', '') for g in dz_genres if isinstance(g, dict) and g.get('name')])
|
|
else:
|
|
genre_id = data.get('genre_id')
|
|
if genre_id and genre_id in self._DEEZER_GENRE_MAP:
|
|
fields['genres'] = json.dumps([self._DEEZER_GENRE_MAP[genre_id]])
|
|
urls = {}
|
|
if data.get('link'):
|
|
urls['deezer'] = data['link']
|
|
fields['external_urls'] = json.dumps(urls)
|
|
|
|
elif entity_type == 'track':
|
|
fields['name'] = data.get('title', '')
|
|
artist = data.get('artist', {})
|
|
fields['artist_name'] = artist.get('name', '') if isinstance(artist, dict) else ''
|
|
fields['artist_id'] = str(artist.get('id', '')) if isinstance(artist, dict) else ''
|
|
album = data.get('album', {})
|
|
fields['album_name'] = album.get('title', '') if isinstance(album, dict) else ''
|
|
fields['album_id'] = str(album.get('id', '')) if isinstance(album, dict) else ''
|
|
fields['image_url'] = (album.get('cover_xl') or album.get('cover_big') or album.get('cover_medium')) if isinstance(album, dict) else None
|
|
fields['duration_ms'] = data.get('duration', 0) * 1000
|
|
fields['track_number'] = data.get('track_position')
|
|
fields['disc_number'] = data.get('disk_number', 1)
|
|
fields['explicit'] = 1 if data.get('explicit_lyrics') else 0
|
|
fields['popularity'] = data.get('rank', 0)
|
|
fields['isrc'] = data.get('isrc')
|
|
fields['preview_url'] = data.get('preview')
|
|
urls = {}
|
|
if data.get('link'):
|
|
urls['deezer'] = data['link']
|
|
fields['external_urls'] = json.dumps(urls)
|
|
|
|
return fields
|
|
|
|
def _extract_itunes_fields(self, entity_type: str, data: dict) -> dict:
|
|
"""Extract fields from iTunes API response."""
|
|
fields = {}
|
|
|
|
def _upscale_artwork(url):
|
|
"""Convert iTunes 100x100 artwork to 600x600."""
|
|
if url and '100x100' in url:
|
|
return url.replace('100x100', '600x600')
|
|
return url
|
|
|
|
if entity_type == 'artist':
|
|
fields['name'] = data.get('artistName', '')
|
|
genre = data.get('primaryGenreName', '')
|
|
fields['genres'] = json.dumps([genre] if genre else [])
|
|
fields['popularity'] = 0
|
|
fields['followers'] = 0
|
|
fields['image_url'] = _upscale_artwork(data.get('artworkUrl100'))
|
|
urls = {}
|
|
if data.get('artistViewUrl'):
|
|
urls['itunes'] = data['artistViewUrl']
|
|
fields['external_urls'] = json.dumps(urls)
|
|
|
|
elif entity_type == 'album':
|
|
fields['name'] = data.get('collectionName', '')
|
|
fields['artist_name'] = data.get('artistName', '')
|
|
fields['artist_id'] = str(data.get('artistId', ''))
|
|
fields['release_date'] = data.get('releaseDate', '')[:10] if data.get('releaseDate') else ''
|
|
fields['total_tracks'] = data.get('trackCount', 0)
|
|
# Infer album type from track count
|
|
tc = data.get('trackCount', 0)
|
|
if tc <= 3:
|
|
fields['album_type'] = 'single'
|
|
elif tc <= 6:
|
|
fields['album_type'] = 'ep'
|
|
else:
|
|
fields['album_type'] = 'album'
|
|
fields['image_url'] = _upscale_artwork(data.get('artworkUrl100'))
|
|
urls = {}
|
|
if data.get('collectionViewUrl'):
|
|
urls['itunes'] = data['collectionViewUrl']
|
|
fields['external_urls'] = json.dumps(urls)
|
|
|
|
elif entity_type == 'track':
|
|
fields['name'] = data.get('trackName', '')
|
|
fields['artist_name'] = data.get('artistName', '')
|
|
fields['artist_id'] = str(data.get('artistId', ''))
|
|
fields['album_name'] = data.get('collectionName', '')
|
|
fields['album_id'] = str(data.get('collectionId', ''))
|
|
fields['image_url'] = _upscale_artwork(data.get('artworkUrl100'))
|
|
fields['duration_ms'] = data.get('trackTimeMillis', 0)
|
|
fields['track_number'] = data.get('trackNumber')
|
|
fields['disc_number'] = data.get('discNumber', 1)
|
|
fields['explicit'] = 1 if data.get('trackExplicitness') == 'explicit' else 0
|
|
fields['preview_url'] = data.get('previewUrl')
|
|
urls = {}
|
|
if data.get('trackViewUrl'):
|
|
urls['itunes'] = data['trackViewUrl']
|
|
fields['external_urls'] = json.dumps(urls)
|
|
|
|
return fields
|
|
|
|
def _extract_beatport_fields(self, entity_type: str, data: dict) -> dict:
|
|
"""Extract fields from Beatport enriched track data."""
|
|
fields = {}
|
|
|
|
if entity_type == 'track':
|
|
fields['name'] = data.get('title', '')
|
|
fields['artist_name'] = data.get('artist', '')
|
|
fields['album_name'] = data.get('release_name', '')
|
|
fields['album_id'] = data.get('release_id', '')
|
|
fields['image_url'] = data.get('release_image', '')
|
|
fields['label'] = data.get('label', '')
|
|
fields['release_date'] = data.get('release_date', '')
|
|
# Beatport duration is in seconds, convert to ms
|
|
duration = data.get('duration', 0)
|
|
fields['duration_ms'] = int(duration) * 1000 if duration else 0
|
|
fields['track_number'] = data.get('position')
|
|
fields['genres'] = json.dumps([data['genre']]) if data.get('genre') else '[]'
|
|
urls = {}
|
|
if data.get('url'):
|
|
urls['beatport'] = data['url']
|
|
fields['external_urls'] = json.dumps(urls)
|
|
|
|
return fields
|
|
|
|
# ─── Discovery Methods (mine cache for recommendations) ──────
|
|
|
|
def get_undiscovered_albums(self, top_artist_names, library_album_keys, source=None, limit=20):
    """Find popular cached albums by user's top artists that aren't in their library.

    Args:
        top_artist_names: Artist names to search for (case-insensitive match).
        library_album_keys: Set of (album_lower, artist_lower) tuples already owned.
        source: Optional metadata source to restrict to.
        limit: Maximum number of suggestions to return.

    Returns:
        List of album row dicts, most popular first; [] on error or no input.
    """
    if not top_artist_names:
        return []
    try:
        conn = self._get_db()._get_connection()
        try:
            cursor = conn.cursor()
            params = [name.lower() for name in top_artist_names]
            name_ph = ','.join('?' * len(params))
            extra_sql = ""
            if source:
                extra_sql = "AND source = ?"
                params.append(source)
            # Over-fetch (200) so the library filter below still leaves
            # enough candidates to fill `limit`.
            cursor.execute(f"""
                SELECT name, artist_name, image_url, popularity, release_date, label,
                       source, entity_id, album_type, total_tracks
                FROM metadata_cache_entities
                WHERE entity_type = 'album'
                  AND LOWER(artist_name) IN ({name_ph})
                  {extra_sql}
                ORDER BY COALESCE(popularity, 0) DESC, access_count DESC
                LIMIT 200
            """, params)
            suggestions = []
            for row in cursor.fetchall():
                # Skip anything already present in the user's library.
                if (row['name'].lower().strip(), row['artist_name'].lower().strip()) in library_album_keys:
                    continue
                suggestions.append(dict(row))
                if len(suggestions) >= limit:
                    break
            return suggestions
        finally:
            conn.close()
    except Exception as e:
        logger.error(f"Undiscovered albums error: {e}")
        return []
|
|
|
|
def get_genre_new_releases(self, user_genres, source=None, sources=None, limit=20):
    """Find recently released cached albums matching user's genres.

    Args:
        user_genres: Genre names to match (substring LIKE against stored JSON).
        source: Single source restriction; ignored when `sources` is given.
        sources: Optional list of sources to restrict to.
        limit: Maximum number of rows to return.

    Returns:
        List of album row dicts, newest first; [] on error or no genres.
    """
    if not user_genres:
        return []
    try:
        conn = self._get_db()._get_connection()
        try:
            cursor = conn.cursor()
            like_sql = ' OR '.join('genres LIKE ?' for _ in user_genres)
            params = [f'%{g}%' for g in user_genres]
            src_sql = ""
            if sources:
                src_sql = f"AND source IN ({','.join('?' * len(sources))})"
                params += list(sources)
            elif source:
                src_sql = "AND source = ?"
                params.append(source)
            # Only albums released within the last half year count as "new".
            cursor.execute(f"""
                SELECT name, artist_name, image_url, popularity, release_date, genres,
                       source, entity_id, album_type, total_tracks
                FROM metadata_cache_entities
                WHERE entity_type = 'album'
                  AND release_date != '' AND release_date IS NOT NULL
                  AND release_date >= date('now', '-180 days')
                  AND ({like_sql})
                  {src_sql}
                ORDER BY release_date DESC, COALESCE(popularity, 0) DESC
                LIMIT ?
            """, params + [limit])
            return [dict(row) for row in cursor.fetchall()]
        finally:
            conn.close()
    except Exception as e:
        logger.error(f"Genre new releases error: {e}")
        return []
|
|
|
|
def get_label_explorer(self, user_labels, source=None, limit=20):
    """Find popular cached albums from labels the user already has.

    Args:
        user_labels: Exact label names to match.
        source: Optional metadata source restriction.
        limit: Maximum number of rows to return.

    Returns:
        List of album row dicts, most popular first; [] on error or no labels.
    """
    if not user_labels:
        return []
    try:
        conn = self._get_db()._get_connection()
        try:
            cursor = conn.cursor()
            params = list(user_labels)
            label_ph = ','.join('?' * len(params))
            src_sql = ""
            if source:
                src_sql = "AND source = ?"
                params.append(source)
            cursor.execute(f"""
                SELECT name, artist_name, image_url, popularity, release_date, label,
                       source, entity_id, album_type, total_tracks
                FROM metadata_cache_entities
                WHERE entity_type = 'album'
                  AND label IN ({label_ph})
                  {src_sql}
                ORDER BY COALESCE(popularity, 0) DESC, access_count DESC
                LIMIT ?
            """, params + [limit])
            return [dict(row) for row in cursor.fetchall()]
        finally:
            conn.close()
    except Exception as e:
        logger.error(f"Label explorer error: {e}")
        return []
|
|
|
|
def get_deep_cuts(self, artist_names, source=None, popularity_cap=30, limit=20):
    """Find low-popularity tracks from artists the user listens to.

    Args:
        artist_names: Artist names to look up (case-insensitive).
        source: Optional metadata source restriction.
        popularity_cap: Only return tracks at or below this popularity (or NULL).
        limit: Maximum number of rows to return.

    Returns:
        List of track row dicts, least popular first; [] on error or no input.
    """
    if not artist_names:
        return []
    try:
        db = self._get_db()
        conn = db._get_connection()
        try:
            cursor = conn.cursor()
            placeholders = ','.join(['?'] * len(artist_names))
            # Bind parameters in the same order as the SQL placeholders:
            # artist names, popularity cap, optional source, then limit.
            # (Previously `source` was appended *before* the popularity cap,
            # mis-binding both values whenever a source filter was used —
            # the query then compared popularity against the source string.)
            params = [a.lower() for a in artist_names] + [popularity_cap]
            source_filter = ""
            if source:
                source_filter = "AND source = ?"
                params.append(source)
            cursor.execute(f"""
                SELECT name, artist_name, image_url, popularity, album_name,
                       source, entity_id, duration_ms, album_id
                FROM metadata_cache_entities
                WHERE entity_type = 'track'
                  AND LOWER(artist_name) IN ({placeholders})
                  AND (popularity IS NULL OR popularity <= ?)
                  {source_filter}
                ORDER BY COALESCE(popularity, 50) ASC, access_count DESC
                LIMIT ?
            """, params + [limit])
            return [dict(r) for r in cursor.fetchall()]
        finally:
            conn.close()
    except Exception as e:
        logger.error(f"Deep cuts error: {e}")
        return []
|
|
|
|
def get_genre_deep_dive(self, genre, source=None, sources=None, artist_limit=12, album_limit=20, track_limit=15):
    """Get artists, albums, and tracks for a genre. Albums don't have genres in Spotify,
    so we find artists with matching genres then fetch their cached albums and tracks.

    Args:
        genre: Genre name (substring-matched against stored JSON genre lists).
        source: Single source restriction; ignored when `sources` is given.
        sources: Optional list of sources to restrict all queries to.
        artist_limit: Max artists returned.
        album_limit: Max albums returned.
        track_limit: Max tracks returned.

    Returns:
        Dict with 'artists', 'albums', 'tracks' (plus 'related_genres' on
        success); the three-key empty dict on error or empty genre.
    """
    if not genre:
        return {'artists': [], 'albums': [], 'tracks': []}
    try:
        db = self._get_db()
        conn = db._get_connection()
        try:
            cursor = conn.cursor()

            # Build source filter for allowed sources
            source_filter = ""
            source_params = []
            if sources:
                placeholders = ','.join(['?'] * len(sources))
                source_filter = f"AND source IN ({placeholders})"
                source_params = list(sources)
            elif source:
                source_filter = "AND source = ?"
                source_params = [source]
            params = [f'%{genre}%'] + source_params
            # Fetch extra to allow dedup across sources
            cursor.execute(f"""
                SELECT name, image_url, popularity, followers, entity_id, source, genres
                FROM metadata_cache_entities
                WHERE entity_type = 'artist'
                AND genres LIKE ?
                {source_filter}
                ORDER BY COALESCE(followers, 0) DESC, COALESCE(popularity, 0) DESC
                LIMIT ?
            """, params + [artist_limit * 3])
            # Deduplicate by name — prefer entry with image, then most followers
            seen_artists = {}
            for r in cursor.fetchall():
                r = dict(r)
                key = r['name'].lower().strip()
                existing = seen_artists.get(key)
                if not existing:
                    seen_artists[key] = r
                elif not existing.get('image_url') and r.get('image_url'):
                    seen_artists[key] = r
                elif (existing.get('followers') or 0) < (r.get('followers') or 0):
                    seen_artists[key] = r
            artists = list(seen_artists.values())[:artist_limit]

            # If not enough artists found (e.g. Deezer), find artists via album genres
            # Two-step: get artist names from albums, then look up artist entities
            if len(artists) < artist_limit:
                existing_names = {a['name'].lower() for a in artists}
                album_params = [f'%{genre}%'] + source_params
                # Step 1b: Get distinct artist names from genre-matching albums
                cursor.execute(f"""
                    SELECT DISTINCT artist_name FROM metadata_cache_entities
                    WHERE entity_type = 'album' AND genres LIKE ?
                    {source_filter}
                    LIMIT 50
                """, album_params)
                album_artist_names = [r['artist_name'] for r in cursor.fetchall()
                                      if r['artist_name'] and r['artist_name'].lower() not in existing_names]

                # Step 1c: Look up those artists by name (deduplicate across sources)
                if album_artist_names:
                    name_ph = ','.join(['?'] * len(album_artist_names))
                    art_params = list(album_artist_names) + source_params
                    cursor.execute(f"""
                        SELECT name, image_url, popularity, followers, entity_id, source, genres
                        FROM metadata_cache_entities
                        WHERE entity_type = 'artist'
                        AND name COLLATE NOCASE IN ({name_ph})
                        {source_filter}
                        ORDER BY COALESCE(followers, 0) DESC
                        LIMIT ?
                    """, art_params + [(artist_limit - len(artists)) * 3])
                    for row in cursor.fetchall():
                        r = dict(row)
                        key = r['name'].lower()
                        if key not in existing_names:
                            existing_names.add(key)
                            artists.append(r)
                            if len(artists) >= artist_limit:
                                break

            albums = []
            tracks = []
            # Artist display names drive the album/track lookups below.
            original_names = [a['name'] for a in artists if a.get('name')]

            if original_names:
                name_placeholders = ','.join(['?'] * len(original_names))

                # Step 2: Find albums by those artists (COLLATE NOCASE on column leverages index)
                album_params = list(original_names) + source_params
                cursor.execute(f"""
                    SELECT name, artist_name, image_url, popularity, release_date, label,
                           source, entity_id, album_type, total_tracks, genres
                    FROM metadata_cache_entities
                    WHERE entity_type = 'album'
                    AND artist_name COLLATE NOCASE IN ({name_placeholders})
                    {source_filter}
                    ORDER BY COALESCE(popularity, 0) DESC, RANDOM()
                    LIMIT ?
                """, album_params + [album_limit])
                albums = [dict(r) for r in cursor.fetchall()]

                # Step 3: Find tracks by those artists
                track_params = list(original_names) + source_params
                cursor.execute(f"""
                    SELECT name, artist_name, image_url, popularity, album_name,
                           source, entity_id, duration_ms, album_id
                    FROM metadata_cache_entities
                    WHERE entity_type = 'track'
                    AND artist_name COLLATE NOCASE IN ({name_placeholders})
                    {source_filter}
                    ORDER BY COALESCE(popularity, 0) DESC, RANDOM()
                    LIMIT ?
                """, track_params + [track_limit])
                tracks = [dict(r) for r in cursor.fetchall()]

            # Step 4: Find related genres from artist genres + ALL albums by these artists
            related_genres = {}
            genre_lower = genre.lower()

            # From artist genre data (Spotify/iTunes — multiple genres per artist)
            for artist in artists:
                try:
                    artist_genres = json.loads(artist.get('genres', '[]'))
                    if isinstance(artist_genres, list):
                        for g in artist_genres:
                            g_lower = g.strip().lower()
                            if g_lower and g_lower != genre_lower:
                                related_genres[g_lower] = related_genres.get(g_lower, 0) + 1
                except (json.JSONDecodeError, TypeError):
                    pass

            # From ALL albums by these artists (not just the 20 we fetched)
            # This finds cross-genre artists (e.g., artist has Pop AND R&B albums)
            if original_names:
                cursor.execute(f"""
                    SELECT DISTINCT genres FROM metadata_cache_entities
                    WHERE entity_type = 'album'
                    AND artist_name COLLATE NOCASE IN ({name_placeholders})
                    AND genres IS NOT NULL AND genres != '[]'
                    {source_filter}
                """, list(original_names) + source_params)
                for row in cursor.fetchall():
                    try:
                        parsed = json.loads(row['genres'])
                        if isinstance(parsed, list):
                            for g in parsed:
                                g_lower = g.strip().lower()
                                if g_lower and g_lower != genre_lower:
                                    related_genres[g_lower] = related_genres.get(g_lower, 0) + 1
                    except (json.JSONDecodeError, TypeError):
                        pass
            # Keep the 12 most frequently co-occurring genres, title-cased for display.
            related = sorted(
                [{'genre': g.title(), 'count': c} for g, c in related_genres.items()],
                key=lambda x: x['count'], reverse=True
            )[:12]

            # Step 5: Check which albums are in the library
            if albums:
                album_keys = [(a['name'].lower().strip(), a['artist_name'].lower().strip()) for a in albums]
                or_clauses = ' OR '.join(['(LOWER(al.title) = ? AND LOWER(ar.name) = ?)' for _ in album_keys])
                lib_params = []
                for k in album_keys:
                    lib_params.extend(k)
                cursor.execute(f"""
                    SELECT LOWER(al.title), LOWER(ar.name) FROM albums al
                    JOIN artists ar ON ar.id = al.artist_id
                    WHERE {or_clauses}
                """, lib_params)
                lib_set = {(r[0].strip(), r[1].strip()) for r in cursor.fetchall()}
                for album in albums:
                    album['in_library'] = (album['name'].lower().strip(), album['artist_name'].lower().strip()) in lib_set

            # Step 6: Resolve library artist IDs for navigation (batched)
            if artists:
                lib_name_placeholders = ','.join(['LOWER(?)'] * len(artists))
                lib_name_params = [a['name'] for a in artists]
                try:
                    cursor.execute(f"""
                        SELECT id, LOWER(name) as lname FROM artists
                        WHERE LOWER(name) IN ({lib_name_placeholders})
                    """, lib_name_params)
                    lib_id_map = {r['lname']: r['id'] for r in cursor.fetchall()}
                except Exception:
                    # Best-effort: library navigation IDs are optional extras.
                    lib_id_map = {}
                for artist in artists:
                    artist['library_id'] = lib_id_map.get(artist['name'].lower())

            return {'artists': artists, 'albums': albums, 'tracks': tracks, 'related_genres': related}
        finally:
            conn.close()
    except Exception as e:
        logger.error(f"Genre deep dive error: {e}")
        return {'artists': [], 'albums': [], 'tracks': []}
|
|
|
|
# In-process result cache for get_genre_explorer, keyed by source combination.
# NOTE(review): class-level and therefore shared across instances — safe here
# only because MetadataCache is accessed as a singleton via get_metadata_cache().
_genre_explorer_cache = {}  # {source: (timestamp, results)}
_GENRE_EXPLORER_TTL = 86400  # 24 hours
|
|
|
|
def get_genre_explorer(self, user_genres_set, source=None, sources=None):
    """Aggregate genres from cached artists and albums, highlight unexplored ones.

    Args:
        user_genres_set: Genres the user already listens to (used for the
            'explored' flag only — does not affect which genres are returned).
        source: Single source restriction; ignored when `sources` is given.
        sources: Optional list of sources to restrict to.

    Returns:
        List of {'genre', 'artist_count', 'explored'} dicts, most-populated
        first (top 50, single-artist genres skipped); [] on error.
    """
    import time
    # Aggregation results are cached per source-combination for 24h; only
    # the per-user 'explored' flag is recomputed on a cache hit.
    cache_key = ','.join(sorted(sources)) if sources else (source or '_all')
    cached = self._genre_explorer_cache.get(cache_key)
    if cached:
        ts, raw_results = cached
        if time.time() - ts < self._GENRE_EXPLORER_TTL:
            user_lower = {g.lower() for g in user_genres_set} if user_genres_set else set()
            for r in raw_results:
                r['explored'] = r['genre'].lower() in user_lower
            return raw_results
    try:
        db = self._get_db()
        conn = db._get_connection()
        try:
            cursor = conn.cursor()
            params = []
            source_filter = ""
            if sources:
                placeholders = ','.join(['?'] * len(sources))
                source_filter = f"AND source IN ({placeholders})"
                params.extend(sources)
            elif source:
                source_filter = "AND source = ?"
                params.append(source)

            # Count unique artists per genre from both artist and album entities
            # Artists have genres directly; albums have genre_id-mapped genres + artist_name
            genre_artists = {}  # {genre_lower: set(artist_names)}

            # From artist entities
            cursor.execute(f"""
                SELECT name, genres FROM metadata_cache_entities
                WHERE entity_type = 'artist'
                AND genres IS NOT NULL AND genres != '' AND genres != '[]'
                {source_filter}
            """, params)
            for row in cursor.fetchall():
                try:
                    parsed = json.loads(row['genres'])
                    if isinstance(parsed, list):
                        artist_key = (row['name'] or '').lower()
                        for g in parsed:
                            g_lower = g.strip().lower()
                            if g_lower:
                                genre_artists.setdefault(g_lower, set()).add(artist_key)
                except (json.JSONDecodeError, TypeError):
                    # Malformed stored genres JSON — skip the row.
                    pass

            # From album entities (for Deezer where artists lack genres)
            cursor.execute(f"""
                SELECT artist_name, genres FROM metadata_cache_entities
                WHERE entity_type = 'album'
                AND genres IS NOT NULL AND genres != '' AND genres != '[]'
                {source_filter}
            """, params)
            for row in cursor.fetchall():
                try:
                    parsed = json.loads(row['genres'])
                    if isinstance(parsed, list):
                        artist_key = (row['artist_name'] or '').lower()
                        if artist_key:
                            for g in parsed:
                                g_lower = g.strip().lower()
                                if g_lower:
                                    genre_artists.setdefault(g_lower, set()).add(artist_key)
                except (json.JSONDecodeError, TypeError):
                    pass

            user_lower = {g.lower() for g in user_genres_set} if user_genres_set else set()
            results = []
            for genre, artists_set in sorted(genre_artists.items(), key=lambda x: len(x[1]), reverse=True)[:50]:
                if len(artists_set) < 2:
                    continue  # Skip genres with only 1 artist
                results.append({
                    'genre': genre.title(),
                    'artist_count': len(artists_set),
                    'explored': genre in user_lower,
                })
            # Cache for subsequent requests
            self._genre_explorer_cache[cache_key] = (time.time(), results)
            return results
        finally:
            conn.close()
    except Exception as e:
        logger.error(f"Genre explorer error: {e}")
        return []
|
|
|
|
def backfill_deezer_album_genres(self):
    """One-time backfill: extract genres from raw_json for Deezer albums that have genres: '[]'.

    Deezer album API responses include genres in data.genres.data[].name but this wasn't
    extracted in earlier versions. This parses the stored raw_json and updates the genres field.

    Phase 2 then propagates the recovered album genres onto Deezer artist
    entities (matched by artist_name or artist_id) that still have empty genres.

    Returns:
        Total number of album + artist rows updated, or 0 on error.
    """
    try:
        db = self._get_db()
        conn = db._get_connection()
        try:
            cursor = conn.cursor()
            # Phase 1: Deezer albums whose genres column was never populated.
            cursor.execute("""
                SELECT id, raw_json FROM metadata_cache_entities
                WHERE source = 'deezer' AND entity_type = 'album'
                AND (genres IS NULL OR genres = '' OR genres = '[]')
                AND raw_json IS NOT NULL
                LIMIT 50000
            """)
            updated = 0
            for row in cursor.fetchall():
                try:
                    raw = json.loads(row['raw_json'])
                    genre_names = []

                    # Try full genres object (from get_album responses)
                    dz_genres = raw.get('genres', {})
                    if isinstance(dz_genres, dict):
                        dz_genres = dz_genres.get('data', [])
                    if isinstance(dz_genres, list) and dz_genres:
                        genre_names = [g.get('name', '') for g in dz_genres if isinstance(g, dict) and g.get('name')]

                    # Fallback: genre_id from search responses
                    if not genre_names:
                        genre_id = raw.get('genre_id')
                        if genre_id and genre_id in self._DEEZER_GENRE_MAP:
                            genre_names = [self._DEEZER_GENRE_MAP[genre_id]]

                    if genre_names:
                        cursor.execute(
                            "UPDATE metadata_cache_entities SET genres = ? WHERE id = ?",
                            (json.dumps(genre_names), row['id'])
                        )
                        updated += 1
                except Exception:
                    # Best-effort per row: one bad raw_json blob shouldn't stop the batch.
                    continue
            conn.commit()
            logger.info(f"Deezer album genre backfill: updated {updated} albums")

            # Phase 2: Propagate album genres to Deezer artist entities
            # Match by artist_name or artist_id from albums that have genres
            artist_updated = 0
            cursor.execute("""
                SELECT DISTINCT artist_name, artist_id, genres
                FROM metadata_cache_entities
                WHERE source = 'deezer' AND entity_type = 'album'
                AND genres IS NOT NULL AND genres != '' AND genres != '[]'
                AND (artist_name != '' OR artist_id != '')
            """)
            album_artists = {}  # {artist_identifier: set(genres)}
            for row in cursor.fetchall():
                try:
                    names = json.loads(row['genres'])
                    if not isinstance(names, list):
                        continue
                    # Key by artist_name or artist_id
                    key_name = row['artist_name'] or ''
                    key_id = row['artist_id'] or ''
                    for key in [k for k in [key_name, key_id] if k]:
                        album_artists.setdefault(key, set()).update(names)
                except Exception:
                    continue

            # Update artist entities that have empty genres
            cursor.execute("""
                SELECT id, name, entity_id FROM metadata_cache_entities
                WHERE source = 'deezer' AND entity_type = 'artist'
                AND (genres IS NULL OR genres = '' OR genres = '[]')
            """)
            for row in cursor.fetchall():
                # Artist rows store the display name in `name` and the Deezer
                # id in `entity_id`; the album map above is keyed by both.
                genres = album_artists.get(row['name']) or album_artists.get(row['entity_id']) or set()
                if genres:
                    cursor.execute(
                        "UPDATE metadata_cache_entities SET genres = ? WHERE id = ?",
                        (json.dumps(list(genres)), row['id'])
                    )
                    artist_updated += 1
            conn.commit()
            logger.info(f"Deezer artist genre backfill: updated {artist_updated} artists from album genres")

            return updated + artist_updated
        finally:
            conn.close()
    except Exception as e:
        logger.error(f"Deezer genre backfill error: {e}")
        return 0
|