Lift liked-artist matching to core/artists/liked_match.py

Lifts _match_liked_artists_to_all_sources and
_backfill_liked_artist_images. Both bodies are byte-identical to the
originals. Uses the same _SpotifyClientProxy + _get_*_client shim
pattern as core/artists/map.py so the bodies resolve their original
names without modification.

web_server.py: 37501 → 37245 (-256 lines).
pull/429/head
Broque Thomas 4 weeks ago
parent 3714a46434
commit 0e237f14d4

@ -0,0 +1,313 @@
"""Liked-artist multi-source matching — lifted from web_server.py.
Both function bodies are byte-identical to the originals. The
``spotify_client`` proxy + ``_get_*_client`` shims let the bodies resolve
their original names without any modification.
"""
import logging
import time
from config.settings import config_manager
from core.metadata.registry import (
get_deezer_client,
get_discogs_client,
get_itunes_client,
get_spotify_client,
)
logger = logging.getLogger(__name__)
def _get_itunes_client():
    """Return the iTunes client obtained from ``get_itunes_client()``.

    Kept under the underscore-prefixed name so the lifted function bodies
    can call it exactly as they did in web_server.
    """
    client = get_itunes_client()
    return client
def _get_deezer_client():
    """Return the Deezer client obtained from ``get_deezer_client()``.

    Kept under the underscore-prefixed name so the lifted function bodies
    can call it exactly as they did in web_server.
    """
    client = get_deezer_client()
    return client
def _get_discogs_client(token=None):
    """Return the Discogs client obtained from ``get_discogs_client(token)``.

    Args:
        token: Passed through unchanged to ``get_discogs_client``;
            defaults to ``None``.
    """
    client = get_discogs_client(token)
    return client
class _SpotifyClientProxy:
    """Stand-in that forwards attribute access to the current Spotify client.

    The client is looked up through ``get_spotify_client()`` on every access
    instead of being stored on the proxy, so a client that is rebound in the
    registry (for example after a Spotify re-auth) is picked up by the
    lifted bodies without any extra wiring.
    """

    def __bool__(self):
        # Truthy only while the registry actually has a client to offer.
        current = get_spotify_client()
        return current is not None

    def __getattr__(self, name):
        # Only invoked for attributes not found on the proxy itself.
        current = get_spotify_client()
        if current is not None:
            return getattr(current, name)
        raise AttributeError(name)
# Module-level proxy bound to the name ``spotify_client`` so the lifted
# function bodies can keep referring to that name unchanged.
spotify_client = _SpotifyClientProxy()
def _match_liked_artists_to_all_sources(database, profile_id: int):
    """Match pending liked artists to ALL metadata sources (Spotify, iTunes, Deezer, Discogs).

    For every pending pool entry the lookup is DB-first (library artists,
    watchlist artists, metadata cache) and then falls back to one API search
    per source that still has no ID. All resolved IDs are stored so source
    switching works instantly. Afterwards the watchlist flags are synced and
    the image backfill is run.

    Args:
        database: Project database object. This function calls
            ``get_liked_artists_pending_match``, ``_get_connection``,
            ``update_liked_artist_match`` and
            ``sync_liked_artists_watchlist_flags`` on it.
        profile_id: Profile whose pending liked artists are processed.

    Returns:
        None. When nothing is pending the function returns immediately and
        the sync / image backfill steps are not run.
    """
    pending = database.get_liked_artists_pending_match(profile_id, limit=200)
    if not pending:
        return

    # Source → column mapping. Insertion order doubles as the preference
    # order when choosing the active source (Spotify, iTunes, Deezer, Discogs).
    source_cols = {
        'spotify': 'spotify_artist_id',
        'itunes': 'itunes_artist_id',
        'deezer': 'deezer_artist_id',
        'discogs': 'discogs_artist_id',
    }
    id_cols = list(source_cols.values())
    max_api_calls = 200  # Hard cap per refresh cycle

    # Reject known placeholder images and local server paths
    _placeholder_hashes = {'2a96cbd8b46e442fc41c2b86b821562f'}

    def _valid_image(url):
        """Return ``url`` if it is a usable remote image URL, else None."""
        if not url or not url.strip():
            return None
        if any(ph in url for ph in _placeholder_hashes):
            return None
        # Local media server paths (Plex/Jellyfin, starting with '/' or '\\')
        # are not loadable in the browser; the http prefix test rejects them
        # together with every other non-http value.
        if not url.startswith('http'):
            return None
        return url

    def _harvest_row(row, image_key, harvested_ids, best_image):
        """Copy missing source IDs from a DB row; return the image to keep."""
        record = dict(row)
        for col in id_cols:
            if record.get(col) and col not in harvested_ids:
                harvested_ids[col] = str(record[col])
        # The first valid image wins; later rows never override it.
        if not best_image and _valid_image(record.get(image_key)):
            return record[image_key]
        return best_image

    # Build search clients for each source. Failures are logged instead of
    # silently dropped so a misconfigured source can be diagnosed.
    search_clients = {}
    if spotify_client and spotify_client.is_spotify_authenticated():
        search_clients['spotify'] = spotify_client
    try:
        search_clients['itunes'] = _get_itunes_client()
    except Exception as e:
        logger.debug(f"[Your Artists] iTunes client unavailable: {e}")
    try:
        search_clients['deezer'] = _get_deezer_client()
    except Exception as e:
        logger.debug(f"[Your Artists] Deezer client unavailable: {e}")
    try:
        dc = _get_discogs_client()
        # Only use Discogs if token is configured
        from config.settings import config_manager as _cm
        if _cm.get('discogs.token', ''):
            search_clients['discogs'] = dc
    except Exception as e:
        logger.debug(f"[Your Artists] Discogs client unavailable: {e}")

    # Reuse watchlist scanner's artist-name normalization
    from core.watchlist_scanner import WatchlistScanner
    _normalize = WatchlistScanner._normalize_artist_name

    def _best_match(results, artist_name):
        """Pick the best search result for ``artist_name`` by name similarity.

        An exact match on the normalized name wins outright. Otherwise the
        result with the highest similarity is returned, provided it reaches
        0.85; if none does, None is returned.
        """
        if not results:
            return None
        target = _normalize(artist_name)

        # Exact normalized match
        for r in results:
            if _normalize(r.name) == target:
                return r

        # Fuzzy scoring. This is NOT a true edit distance: it counts
        # position-by-position character mismatches plus the length
        # difference, so a single inserted character near the start of a
        # name shifts every later position and scores poorly.
        best = None
        best_sim = 0
        for r in results:
            candidate = _normalize(r.name)
            max_len = max(len(target), len(candidate))
            if max_len == 0:
                continue
            mismatches = sum(1 for a, b in zip(target, candidate, strict=False) if a != b)
            distance = mismatches + abs(len(target) - len(candidate))
            sim = (max_len - distance) / max_len
            if sim > best_sim:
                best_sim = sim
                best = r
        if best and best_sim >= 0.85:
            return best
        return None

    api_calls = 0
    matched = 0
    for entry in pending:
        name = entry['artist_name']
        pool_id = entry['id']
        best_image = None

        # Pre-load existing IDs from the entry itself
        harvested_ids = {col: entry[col] for col in id_cols if entry.get(col)}

        # --- DB STRATEGIES (free, no API calls) ---
        # Each lookup is best-effort: a failure is logged and the next
        # strategy still runs.
        # 1. Library artists table
        try:
            cursor = database._get_connection().cursor()
            cursor.execute("SELECT * FROM artists WHERE name = ? COLLATE NOCASE LIMIT 1", (name,))
            row = cursor.fetchone()
            if row:
                best_image = _harvest_row(row, 'thumb_url', harvested_ids, best_image)
        except Exception as e:
            logger.debug(f"[Your Artists] Library lookup failed for '{name}': {e}")

        # 2. Watchlist artists
        try:
            cursor = database._get_connection().cursor()
            cursor.execute(
                "SELECT * FROM watchlist_artists WHERE artist_name = ? COLLATE NOCASE AND profile_id = ? LIMIT 1",
                (name, profile_id)
            )
            row = cursor.fetchone()
            if row:
                best_image = _harvest_row(row, 'image_url', harvested_ids, best_image)
        except Exception as e:
            logger.debug(f"[Your Artists] Watchlist lookup failed for '{name}': {e}")

        # 3. Metadata cache (all sources)
        try:
            cursor = database._get_connection().cursor()
            cursor.execute(
                "SELECT entity_id, source, image_url FROM metadata_cache_entities WHERE entity_type = 'artist' AND name = ? COLLATE NOCASE",
                (name,)
            )
            for row in cursor.fetchall():
                col = source_cols.get(row['source'])
                if col and col not in harvested_ids:
                    harvested_ids[col] = row['entity_id']
                if not best_image and _valid_image(row['image_url']):
                    best_image = row['image_url']
        except Exception as e:
            logger.debug(f"[Your Artists] Metadata cache lookup failed for '{name}': {e}")

        # --- API STRATEGIES (search each missing source) ---
        for source, col in source_cols.items():
            if col in harvested_ids:
                continue  # Already have this source's ID
            client = search_clients.get(source)
            if not client:
                continue
            if api_calls >= max_api_calls:
                break
            # Counted up front so failed requests consume budget too.
            api_calls += 1
            try:
                results = client.search_artists(name, limit=5)
                best = _best_match(results, name)
                if best:
                    harvested_ids[col] = best.id
                    candidate_image = getattr(best, 'image_url', None)
                    if not best_image and _valid_image(candidate_image):
                        best_image = candidate_image
                time.sleep(0.4)  # Rate limit breathing room
            except Exception as e:
                logger.debug(f"[Your Artists] {source} search failed for '{name}': {e}")

        # Save all harvested IDs
        if not harvested_ids:
            continue

        # Determine best active source/ID — prefer Spotify, then iTunes, Deezer, Discogs
        resolved_source, resolved_id = next(
            ((src, harvested_ids[col]) for src, col in source_cols.items() if col in harvested_ids),
            (None, None),
        )
        database.update_liked_artist_match(
            pool_id, active_source=resolved_source, active_source_id=resolved_id,
            image_url=best_image, all_ids=harvested_ids
        )
        matched += 1

    database.sync_liked_artists_watchlist_flags(profile_id)
    logger.info(f"[Your Artists] Matched {matched}/{len(pending)} artists to {len(search_clients)} sources ({api_calls} API calls)")

    # Image backfill: fetch images for matched artists that have IDs but no image
    _backfill_liked_artist_images(database, profile_id, search_clients)
def _backfill_liked_artist_images(database, profile_id: int, search_clients: dict):
    """Fetch images for matched artists missing artwork using their stored source IDs.

    Selects up to 100 ``matched`` rows of ``liked_artists_pool`` for the
    profile whose ``image_url`` is NULL, empty, the known placeholder, or
    not an http URL. For each row a Spotify artist lookup is tried first
    (only when a ``'spotify'`` client is present in ``search_clients``);
    otherwise a Deezer image URL is built from the stored Deezer ID. All
    updates are committed once at the end.

    Best-effort: every failure is logged at debug level and never raised.

    Args:
        database: Project database object exposing ``_get_connection()``.
        profile_id: Profile whose pool rows are backfilled.
        search_clients: Mapping of source name to client, as built by
            ``_match_liked_artists_to_all_sources``.

    Returns:
        None.
    """
    # NOTE(review): rows that end up with no image keep matching the WHERE
    # clause, and LIMIT 100 has no ORDER BY, so the same rows may be
    # re-selected on every run — confirm whether that is acceptable.
    try:
        conn = database._get_connection()
        cursor = conn.cursor()
        cursor.execute("""
            SELECT id, artist_name, spotify_artist_id, itunes_artist_id, deezer_artist_id
            FROM liked_artists_pool
            WHERE profile_id = ? AND match_status = 'matched'
            AND (image_url IS NULL OR image_url = ''
            OR image_url LIKE '%2a96cbd8b46e442fc41c2b86b821562f%'
            OR image_url NOT LIKE 'http%')
            LIMIT 100
        """, (profile_id,))
        rows = cursor.fetchall()
        if not rows:
            return

        logger.info(f"[Your Artists] Backfilling images for {len(rows)} artists...")
        filled = 0
        for row in rows:
            r = dict(row)
            image_url = None
            made_request = False

            # Try Spotify artist lookup (has best images)
            if r.get('spotify_artist_id') and 'spotify' in search_clients:
                try:
                    # Resolve the underlying API handle once per row; it may
                    # be missing or None when the client is not ready.
                    api = getattr(search_clients['spotify'], 'sp', None)
                    if api:
                        made_request = True
                        artist_data = api.artist(r['spotify_artist_id'])
                        if artist_data and artist_data.get('images'):
                            image_url = artist_data['images'][0]['url']
                except Exception as e:
                    logger.debug(f"[Your Artists] Spotify image lookup failed for '{r.get('artist_name')}': {e}")

            # Try Deezer (direct image URL from ID) — no request is made here
            if not image_url and r.get('deezer_artist_id'):
                image_url = f"https://api.deezer.com/artist/{r['deezer_artist_id']}/image?size=big"

            if image_url:
                try:
                    conn.cursor().execute(
                        "UPDATE liked_artists_pool SET image_url = ? WHERE id = ?",
                        (image_url, r['id'])
                    )
                    filled += 1
                except Exception as e:
                    logger.debug(f"[Your Artists] Image update failed for pool id {r['id']}: {e}")

            # Throttle only after a real API request; rows resolved purely
            # from a locally built Deezer URL need no pause.
            if made_request:
                time.sleep(0.3)

        conn.commit()
        if filled:
            logger.info(f"[Your Artists] Backfilled {filled}/{len(rows)} artist images")
    except Exception as e:
        logger.debug(f"[Your Artists] Image backfill error: {e}")

@ -29594,266 +29594,10 @@ def _fetch_and_match_liked_artists(profile_id: int):
_match_liked_artists_to_all_sources(database, profile_id)
def _match_liked_artists_to_all_sources(database, profile_id: int):
    """Match pending liked artists to ALL metadata sources (Spotify, iTunes, Deezer, Discogs).

    For every pending pool entry the lookup is DB-first (library artists,
    watchlist artists, metadata cache) and then falls back to one API search
    per source that still has no ID. All resolved IDs are stored so source
    switching works instantly. Afterwards the watchlist flags are synced and
    the image backfill is run.

    Args:
        database: Project database object. This function calls
            ``get_liked_artists_pending_match``, ``_get_connection``,
            ``update_liked_artist_match`` and
            ``sync_liked_artists_watchlist_flags`` on it.
        profile_id: Profile whose pending liked artists are processed.

    Returns:
        None. When nothing is pending the function returns immediately and
        the sync / image backfill steps are not run.
    """
    pending = database.get_liked_artists_pending_match(profile_id, limit=200)
    if not pending:
        return

    # Source → column mapping. Insertion order doubles as the preference
    # order when choosing the active source (Spotify, iTunes, Deezer, Discogs).
    source_cols = {
        'spotify': 'spotify_artist_id',
        'itunes': 'itunes_artist_id',
        'deezer': 'deezer_artist_id',
        'discogs': 'discogs_artist_id',
    }
    id_cols = list(source_cols.values())
    max_api_calls = 200  # Hard cap per refresh cycle

    # Reject known placeholder images and local server paths
    _placeholder_hashes = {'2a96cbd8b46e442fc41c2b86b821562f'}

    def _valid_image(url):
        """Return ``url`` if it is a usable remote image URL, else None."""
        if not url or not url.strip():
            return None
        if any(ph in url for ph in _placeholder_hashes):
            return None
        # Local media server paths (Plex/Jellyfin, starting with '/' or '\\')
        # are not loadable in the browser; the http prefix test rejects them
        # together with every other non-http value.
        if not url.startswith('http'):
            return None
        return url

    def _harvest_row(row, image_key, harvested_ids, best_image):
        """Copy missing source IDs from a DB row; return the image to keep."""
        record = dict(row)
        for col in id_cols:
            if record.get(col) and col not in harvested_ids:
                harvested_ids[col] = str(record[col])
        # The first valid image wins; later rows never override it.
        if not best_image and _valid_image(record.get(image_key)):
            return record[image_key]
        return best_image

    # Build search clients for each source. Failures are logged instead of
    # silently dropped so a misconfigured source can be diagnosed.
    search_clients = {}
    if spotify_client and spotify_client.is_spotify_authenticated():
        search_clients['spotify'] = spotify_client
    try:
        search_clients['itunes'] = _get_itunes_client()
    except Exception as e:
        logger.debug(f"[Your Artists] iTunes client unavailable: {e}")
    try:
        search_clients['deezer'] = _get_deezer_client()
    except Exception as e:
        logger.debug(f"[Your Artists] Deezer client unavailable: {e}")
    try:
        dc = _get_discogs_client()
        # Only use Discogs if token is configured
        from config.settings import config_manager as _cm
        if _cm.get('discogs.token', ''):
            search_clients['discogs'] = dc
    except Exception as e:
        logger.debug(f"[Your Artists] Discogs client unavailable: {e}")

    # Reuse watchlist scanner's artist-name normalization
    from core.watchlist_scanner import WatchlistScanner
    _normalize = WatchlistScanner._normalize_artist_name

    def _best_match(results, artist_name):
        """Pick the best search result for ``artist_name`` by name similarity.

        An exact match on the normalized name wins outright. Otherwise the
        result with the highest similarity is returned, provided it reaches
        0.85; if none does, None is returned.
        """
        if not results:
            return None
        target = _normalize(artist_name)

        # Exact normalized match
        for r in results:
            if _normalize(r.name) == target:
                return r

        # Fuzzy scoring. This is NOT a true edit distance: it counts
        # position-by-position character mismatches plus the length
        # difference, so a single inserted character near the start of a
        # name shifts every later position and scores poorly.
        best = None
        best_sim = 0
        for r in results:
            candidate = _normalize(r.name)
            max_len = max(len(target), len(candidate))
            if max_len == 0:
                continue
            mismatches = sum(1 for a, b in zip(target, candidate, strict=False) if a != b)
            distance = mismatches + abs(len(target) - len(candidate))
            sim = (max_len - distance) / max_len
            if sim > best_sim:
                best_sim = sim
                best = r
        if best and best_sim >= 0.85:
            return best
        return None

    api_calls = 0
    matched = 0
    for entry in pending:
        name = entry['artist_name']
        pool_id = entry['id']
        best_image = None

        # Pre-load existing IDs from the entry itself
        harvested_ids = {col: entry[col] for col in id_cols if entry.get(col)}

        # --- DB STRATEGIES (free, no API calls) ---
        # Each lookup is best-effort: a failure is logged and the next
        # strategy still runs.
        # 1. Library artists table
        try:
            cursor = database._get_connection().cursor()
            cursor.execute("SELECT * FROM artists WHERE name = ? COLLATE NOCASE LIMIT 1", (name,))
            row = cursor.fetchone()
            if row:
                best_image = _harvest_row(row, 'thumb_url', harvested_ids, best_image)
        except Exception as e:
            logger.debug(f"[Your Artists] Library lookup failed for '{name}': {e}")

        # 2. Watchlist artists
        try:
            cursor = database._get_connection().cursor()
            cursor.execute(
                "SELECT * FROM watchlist_artists WHERE artist_name = ? COLLATE NOCASE AND profile_id = ? LIMIT 1",
                (name, profile_id)
            )
            row = cursor.fetchone()
            if row:
                best_image = _harvest_row(row, 'image_url', harvested_ids, best_image)
        except Exception as e:
            logger.debug(f"[Your Artists] Watchlist lookup failed for '{name}': {e}")

        # 3. Metadata cache (all sources)
        try:
            cursor = database._get_connection().cursor()
            cursor.execute(
                "SELECT entity_id, source, image_url FROM metadata_cache_entities WHERE entity_type = 'artist' AND name = ? COLLATE NOCASE",
                (name,)
            )
            for row in cursor.fetchall():
                col = source_cols.get(row['source'])
                if col and col not in harvested_ids:
                    harvested_ids[col] = row['entity_id']
                if not best_image and _valid_image(row['image_url']):
                    best_image = row['image_url']
        except Exception as e:
            logger.debug(f"[Your Artists] Metadata cache lookup failed for '{name}': {e}")

        # --- API STRATEGIES (search each missing source) ---
        for source, col in source_cols.items():
            if col in harvested_ids:
                continue  # Already have this source's ID
            client = search_clients.get(source)
            if not client:
                continue
            if api_calls >= max_api_calls:
                break
            # Counted up front so failed requests consume budget too.
            api_calls += 1
            try:
                results = client.search_artists(name, limit=5)
                best = _best_match(results, name)
                if best:
                    harvested_ids[col] = best.id
                    candidate_image = getattr(best, 'image_url', None)
                    if not best_image and _valid_image(candidate_image):
                        best_image = candidate_image
                time.sleep(0.4)  # Rate limit breathing room
            except Exception as e:
                logger.debug(f"[Your Artists] {source} search failed for '{name}': {e}")

        # Save all harvested IDs
        if not harvested_ids:
            continue

        # Determine best active source/ID — prefer Spotify, then iTunes, Deezer, Discogs
        resolved_source, resolved_id = next(
            ((src, harvested_ids[col]) for src, col in source_cols.items() if col in harvested_ids),
            (None, None),
        )
        database.update_liked_artist_match(
            pool_id, active_source=resolved_source, active_source_id=resolved_id,
            image_url=best_image, all_ids=harvested_ids
        )
        matched += 1

    database.sync_liked_artists_watchlist_flags(profile_id)
    logger.info(f"[Your Artists] Matched {matched}/{len(pending)} artists to {len(search_clients)} sources ({api_calls} API calls)")

    # Image backfill: fetch images for matched artists that have IDs but no image
    _backfill_liked_artist_images(database, profile_id, search_clients)
def _backfill_liked_artist_images(database, profile_id: int, search_clients: dict):
    """Fetch images for matched artists missing artwork using their stored source IDs.

    Selects up to 100 ``matched`` rows of ``liked_artists_pool`` for the
    profile whose ``image_url`` is NULL, empty, the known placeholder, or
    not an http URL. For each row a Spotify artist lookup is tried first
    (only when a ``'spotify'`` client is present in ``search_clients``);
    otherwise a Deezer image URL is built from the stored Deezer ID. All
    updates are committed once at the end.

    Best-effort: every failure is logged at debug level and never raised.

    Args:
        database: Project database object exposing ``_get_connection()``.
        profile_id: Profile whose pool rows are backfilled.
        search_clients: Mapping of source name to client, as built by
            ``_match_liked_artists_to_all_sources``.

    Returns:
        None.
    """
    # NOTE(review): rows that end up with no image keep matching the WHERE
    # clause, and LIMIT 100 has no ORDER BY, so the same rows may be
    # re-selected on every run — confirm whether that is acceptable.
    try:
        conn = database._get_connection()
        cursor = conn.cursor()
        cursor.execute("""
            SELECT id, artist_name, spotify_artist_id, itunes_artist_id, deezer_artist_id
            FROM liked_artists_pool
            WHERE profile_id = ? AND match_status = 'matched'
            AND (image_url IS NULL OR image_url = ''
            OR image_url LIKE '%2a96cbd8b46e442fc41c2b86b821562f%'
            OR image_url NOT LIKE 'http%')
            LIMIT 100
        """, (profile_id,))
        rows = cursor.fetchall()
        if not rows:
            return

        logger.info(f"[Your Artists] Backfilling images for {len(rows)} artists...")
        filled = 0
        for row in rows:
            r = dict(row)
            image_url = None
            made_request = False

            # Try Spotify artist lookup (has best images)
            if r.get('spotify_artist_id') and 'spotify' in search_clients:
                try:
                    # Resolve the underlying API handle once per row; it may
                    # be missing or None when the client is not ready.
                    api = getattr(search_clients['spotify'], 'sp', None)
                    if api:
                        made_request = True
                        artist_data = api.artist(r['spotify_artist_id'])
                        if artist_data and artist_data.get('images'):
                            image_url = artist_data['images'][0]['url']
                except Exception as e:
                    logger.debug(f"[Your Artists] Spotify image lookup failed for '{r.get('artist_name')}': {e}")

            # Try Deezer (direct image URL from ID) — no request is made here
            if not image_url and r.get('deezer_artist_id'):
                image_url = f"https://api.deezer.com/artist/{r['deezer_artist_id']}/image?size=big"

            if image_url:
                try:
                    conn.cursor().execute(
                        "UPDATE liked_artists_pool SET image_url = ? WHERE id = ?",
                        (image_url, r['id'])
                    )
                    filled += 1
                except Exception as e:
                    logger.debug(f"[Your Artists] Image update failed for pool id {r['id']}: {e}")

            # Throttle only after a real API request; rows resolved purely
            # from a locally built Deezer URL need no pause.
            if made_request:
                time.sleep(0.3)

        conn.commit()
        if filled:
            logger.info(f"[Your Artists] Backfilled {filled}/{len(rows)} artist images")
    except Exception as e:
        logger.debug(f"[Your Artists] Image backfill error: {e}")
from core.artists.liked_match import (
_backfill_liked_artist_images,
_match_liked_artists_to_all_sources,
)
# ── Your Albums (Liked Albums Pool) ──

Loading…
Cancel
Save