SoulSync/core/playlists/explorer.py

"""Playlist explorer build-tree route.

`playlist_explorer_build_tree(deps)` is the body of the
`POST /api/playlist-explorer/build-tree` route. It builds a discovery
tree from a mirrored playlist and streams the result as NDJSON
(one JSON object per artist line + a final 'complete' line).

Works with Spotify (preferred), iTunes, or Deezer as the metadata
source. Uses and populates the metadata cache to avoid redundant API
calls per discography fetch.

Two operating modes:
- `albums`: only show releases that overlap with the playlist's tracks.
- `discographies`: show the full discography of every artist in the
  playlist, with `in_playlist` flag on the matching releases.

Per-artist flow inside the streaming generator:
1. Resolve discography via `_fetch_artist_discography` (cache → fall
   through to live API search).
2. Tag each release with `in_playlist` based on title-similarity match
   against the playlist's track/album names.
3. Apply mode filter, sort by in-playlist-first then year DESC.
4. Yield one JSON line per artist.

The route returns Flask's streaming `Response` wrapper around the NDJSON
generator. Early-exit cases (bad request, playlist not found, top-level
exception) yield via Flask's standard `jsonify(...), status` shape.

Lifted verbatim from web_server.py. Wide dependency surface (Flask
`request` + `Response`, Spotify client, multiple metadata helpers,
DB access, metadata cache) all injected via `PlaylistExplorerDeps`.
"""

from __future__ import annotations

import json
import logging
import time
from dataclasses import dataclass
from typing import Any, Callable

logger = logging.getLogger(__name__)


@dataclass
class PlaylistExplorerDeps:
    """Bundle of cross-cutting deps the playlist explorer needs."""
    request: Any  # flask.request proxy
    flask_response: Any  # flask.Response constructor
    flask_jsonify: Any  # flask.jsonify
    spotify_client: Any
    get_database: Callable[[], Any]
    get_active_discovery_source: Callable[[], str]
    get_metadata_fallback_client: Callable[[], Any]
    get_metadata_fallback_source: Callable[[], str]
    get_metadata_cache: Callable[[], Any]


def playlist_explorer_build_tree(deps: PlaylistExplorerDeps):
    """Build a discovery tree from a mirrored playlist.
    Streams NDJSON: one line per artist with their albums.
    Works with Spotify, iTunes, or Deezer as the metadata source.
    Uses and populates the metadata cache to avoid redundant API calls."""
    try:
        data = deps.request.get_json()
        if not data:
            return deps.flask_jsonify({"success": False, "error": "No data provided"}), 400

        playlist_id = data.get('playlist_id')
        mode = data.get('mode', 'albums')  # 'albums' or 'discographies'

        if not playlist_id:
            return deps.flask_jsonify({"success": False, "error": "playlist_id is required"}), 400
        if mode not in ('albums', 'discographies'):
            return deps.flask_jsonify({"success": False, "error": "mode must be 'albums' or 'discographies'"}), 400

        database = deps.get_database()
        playlist = database.get_mirrored_playlist(playlist_id)
        if not playlist:
            return deps.flask_jsonify({"success": False, "error": "Playlist not found"}), 404

        tracks = database.get_mirrored_playlist_tracks(playlist_id)
        if not tracks:
            return deps.flask_jsonify({"success": False, "error": "Playlist has no tracks"}), 400

        # Determine active metadata source — respect user's configured primary
        source_name = deps.get_active_discovery_source()
        if source_name == 'spotify' and deps.spotify_client and deps.spotify_client.is_spotify_authenticated():
            active_client = deps.spotify_client
        else:
            active_client = deps.get_metadata_fallback_client()
            source_name = deps.get_metadata_fallback_source()

        cache = deps.get_metadata_cache()

        # Parse extra_data and group tracks by artist using discovered data
        artist_groups = {}
        for t in tracks:
            extra = {}
            if t.get('extra_data'):
                try:
                    extra = json.loads(t['extra_data']) if isinstance(t['extra_data'], str) else t['extra_data']
                except (json.JSONDecodeError, TypeError):
                    pass

            # Only use discovery data if it matches the active metadata source
            is_discovered = extra.get('discovered', False)
            provider = (extra.get('provider') or '').lower()
            source_matches = provider == source_name or (provider in ('itunes', 'apple') and source_name == 'itunes')

            matched = extra.get('matched_data', {}) if (is_discovered and source_matches) else {}
            artists_list = matched.get('artists', [])
            primary_artist = artists_list[0] if artists_list else None
            # Artists can be dicts {"name": "X", "id": "Y"} or plain strings "X"
            if isinstance(primary_artist, dict):
                artist_name = primary_artist.get('name') or (t.get('artist_name') or '').strip()
                artist_id = primary_artist.get('id') or None
            elif isinstance(primary_artist, str):
                artist_name = primary_artist or (t.get('artist_name') or '').strip()
                artist_id = None
            else:
                artist_name = (t.get('artist_name') or '').strip()
                artist_id = None

            if not artist_name:
                continue

            key = artist_name.lower()
            if key not in artist_groups:
                artist_groups[key] = {
                    'name': artist_name,
                    'artist_id': artist_id,  # Pre-resolved from discovery
                    'tracks': [],
                    'album_names': set(),
                    'discovered': extra.get('discovered', False),
                }
            # If we get an artist_id from a later track but didn't have one before, fill it in
            if artist_id and not artist_groups[key].get('artist_id'):
                artist_groups[key]['artist_id'] = artist_id

            artist_groups[key]['tracks'].append(t.get('track_name', ''))
            # Get album name from discovered data or playlist field
            album_name = ''
            album_data = matched.get('album')
            if isinstance(album_data, dict) and album_data.get('name'):
                album_name = album_data['name']
            elif (t.get('album_name') or '').strip():
                album_name = t['album_name'].strip()
            if album_name:
                artist_groups[key]['album_names'].add(album_name)

        def _normalize_for_match(title):
            import re
            return re.sub(r'\s*[\(\[][^)\]]*[\)\]]', '', title).strip().lower()

        def _fetch_artist_discography(artist_name, known_artist_id=None):
            """Fetch discography using the active client. Checks cache first, stores results after.
            If known_artist_id is provided (from discovery cache), skips the name search."""
            # Check cache for this artist's discography
            cache_key = f"explorer_disco_{artist_name.lower().strip()}"
            cached = cache.get_entity(source_name, 'artist_discography', cache_key) if cache else None
            if cached and isinstance(cached, dict) and cached.get('albums'):
                logger.debug(f"Explorer: cache hit for '{artist_name}' discography")
                return cached

            artist_id = known_artist_id
            artist_image = None

            if artist_id:
                # Already have the ID from discovery — just fetch the artist image
                try:
                    artist_info = active_client.get_artist(artist_id)
                    if artist_info:
                        if isinstance(artist_info, dict):
                            images = artist_info.get('images') or []
                            artist_image = images[0].get('url') if images else None
                        elif hasattr(artist_info, 'image_url'):
                            artist_image = artist_info.image_url
                except Exception:
                    pass
            else:
                # No pre-resolved ID — search by name
                try:
                    search_results = active_client.search_artists(artist_name, limit=5)
                except Exception as e:
                    return {'success': False, 'error': f'Search failed: {e}'}

                if not search_results:
                    return {'success': False, 'error': f'"{artist_name}" not found'}

                # Find best match (exact first, then fuzzy)
                best = None
                for a in search_results:
                    if a.name.lower().strip() == artist_name.lower().strip():
                        best = a
                        break
                if not best:
                    best = search_results[0]

                artist_id = best.id
                artist_image = best.image_url if hasattr(best, 'image_url') else None

            # Fetch albums
            try:
                # skip_cache only supported by spotify_client — other clients don't cache this call
                _skip = {'skip_cache': True} if hasattr(active_client, 'sp') else {}
                all_albums = active_client.get_artist_albums(artist_id, album_type='album,single', **_skip)
            except Exception as e:
                return {'success': False, 'error': f'Album fetch failed: {e}'}

            if not all_albums:
                return {'success': False, 'error': 'No albums found'}

            # Check which albums the user already owns
            owned_titles = set()
            try:
                db = deps.get_database()
                with db._get_connection() as conn:
                    cursor = conn.cursor()
                    # Find all artists in DB matching this name
                    cursor.execute("SELECT id FROM artists WHERE LOWER(name) = LOWER(?)", (artist_name,))
                    artist_rows = cursor.fetchall()
                    for ar in artist_rows:
                        cursor.execute("SELECT title FROM albums WHERE artist_id = ?", (ar['id'],))
                        for alb_row in cursor.fetchall():
                            owned_titles.add((alb_row['title'] or '').strip().lower())
            except Exception:
                pass  # Non-critical — owned badges just won't show

            # Build release list
            releases = []
            for album in all_albums:
                # Skip albums where this artist isn't primary
                if hasattr(album, 'artist_ids') and album.artist_ids and album.artist_ids[0] != artist_id:
                    continue
                releases.append({
                    'title': album.name,
                    'year': album.release_date[:4] if album.release_date else None,
                    'image_url': album.image_url,
                    'spotify_id': album.id,
                    'track_count': album.total_tracks,
                    'album_type': (album.album_type or 'album').lower(),
                    'owned': (album.name or '').strip().lower() in owned_titles,
                })

            result = {
                'success': True,
                'name': artist_name,  # Required for metadata cache validation
                'albums': releases,
                'artist_image': artist_image,
                'artist_id': artist_id,
                'artist_name': artist_name,
            }

            # Store in cache
            if cache and releases:
                try:
                    cache.store_entity(source_name, 'artist_discography', cache_key, result)
                except Exception:
                    pass

            return result

        def generate():
            yield json.dumps({
                "type": "meta",
                "playlist_name": playlist.get('name', 'Unknown Playlist'),
                "playlist_image": playlist.get('image_url', ''),
                "total_artists": len(artist_groups),
                "total_tracks": len(tracks),
                "source": source_name,
            }) + '\n'

            total_albums = 0

            for idx, (_key, group) in enumerate(artist_groups.items()):
                artist_name = group['name']
                playlist_track_names = group['tracks']
                playlist_album_names = group['album_names']

                try:
                    disco = _fetch_artist_discography(artist_name, group.get('artist_id'))

                    if not disco.get('success'):
                        yield json.dumps({
                            "type": "artist",
                            "name": artist_name,
                            "artist_id": None,
                            "image_url": None,
                            "playlist_tracks": playlist_track_names,
                            "albums": [],
                            "error": disco.get('error', 'Not found'),
                        }) + '\n'
                        time.sleep(0.1)
                        continue

                    # Tag each release with in_playlist flag
                    # If no album names available, fall back to matching track names against single titles
                    match_names = playlist_album_names
                    if not match_names:
                        match_names = set(playlist_track_names)

                    all_releases = []
                    for release in disco.get('albums', []):
                        r = dict(release)
                        norm_title = _normalize_for_match(r['title'])
                        r['in_playlist'] = any(
                            _normalize_for_match(a) == norm_title or
                            norm_title in _normalize_for_match(a) or
                            _normalize_for_match(a) in norm_title
                            for a in match_names
                        )
                        all_releases.append(r)

                    # Filter based on mode
                    if mode == 'albums':
                        filtered = [r for r in all_releases if r['in_playlist']]
                    else:
                        filtered = all_releases

                    filtered.sort(key=lambda r: (not r.get('in_playlist', False), -(int(r.get('year') or 0))))
                    total_albums += len(filtered)

                    yield json.dumps({
                        "type": "artist",
                        "name": disco.get('artist_name', artist_name),
                        "artist_id": disco.get('artist_id'),
                        "image_url": disco.get('artist_image'),
                        "playlist_tracks": playlist_track_names,
                        "albums": filtered,
                    }) + '\n'

                except Exception as e:
                    logger.error(f"Explorer: error processing artist '{artist_name}': {e}")
                    yield json.dumps({
                        "type": "artist",
                        "name": artist_name,
                        "artist_id": None,
                        "image_url": None,
                        "playlist_tracks": playlist_track_names,
                        "albums": [],
                        "error": str(e),
                    }) + '\n'

                # Rate limit protection between artists
                if idx < len(artist_groups) - 1:
                    time.sleep(0.2)

            deps.get_database().mark_mirrored_playlist_explored(playlist_id)
            yield json.dumps({"type": "complete", "total_artists": len(artist_groups), "total_albums": total_albums}) + '\n'

        return deps.flask_response(generate(), mimetype='application/x-ndjson', headers={
            'Cache-Control': 'no-cache',
            'X-Accel-Buffering': 'no',
        })

    except Exception as e:
        logger.error(f"Playlist Explorer build-tree error: {e}")
        import traceback
        traceback.print_exc()
        return deps.flask_jsonify({"success": False, "error": str(e)}), 500