# SoulSync/core/musicbrainz_client.py

import requests
import time
import threading
from typing import Dict, List, Optional, Any
from functools import wraps
from utils.logging_config import get_logger

# Module-level logger shared by the rate limiter and the client below.
logger = get_logger("musicbrainz_client")

# Global rate limiting state, shared by every callable wrapped with
# `rate_limited` in this module (so the limit is per-process, not per-client).
#
# _last_api_call_time: `time.time()` stamp of the most recent permitted call.
#     It starts at 0, so the very first call is never delayed.
# _api_call_lock: guards reads/writes of the stamp above; `rate_limited` also
#     sleeps while holding it, which serializes concurrent callers.
# MIN_API_INTERVAL: minimum spacing, in seconds, between two calls.
_last_api_call_time = 0
_api_call_lock = threading.Lock()
MIN_API_INTERVAL = 1.0  # 1 second between API calls (MusicBrainz requirement)

def rate_limited(func):
    """Decorator to enforce rate limiting on MusicBrainz API calls.

    Every wrapped callable shares one module-level timestamp and lock, so
    calls are spaced at least ``MIN_API_INTERVAL`` seconds apart across the
    whole process, regardless of which thread or client instance makes them.

    Each permitted call is also reported to ``api_call_tracker`` under the
    ``'musicbrainz'`` key before the wrapped callable runs.

    If the wrapped callable raises an exception whose text mentions
    "rate limit" or "503", the wrapper logs a warning and pauses for a fixed
    2 seconds before re-raising. It does NOT retry the call; callers decide
    whether to try again. Callables that catch their own exceptions never
    reach this branch.

    Args:
        func: The callable to throttle.

    Returns:
        A wrapper with the same signature and return value as ``func``.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        global _last_api_call_time

        # The wait happens while the lock is held: other threads queue on the
        # lock and are released one interval apart, which serializes calls.
        with _api_call_lock:
            elapsed = time.time() - _last_api_call_time

            if elapsed < MIN_API_INTERVAL:
                # time.time() is wall-clock time and can step backwards (NTP
                # correction, manual clock change), which makes `elapsed`
                # negative. Clamp the wait to one interval so a clock step can
                # never stall every MusicBrainz call for an unbounded time.
                time.sleep(min(MIN_API_INTERVAL - elapsed, MIN_API_INTERVAL))

            _last_api_call_time = time.time()

        # Imported at call time rather than module import time —
        # NOTE(review): presumably to avoid an import cycle; confirm.
        from core.api_call_tracker import api_call_tracker
        api_call_tracker.record_call('musicbrainz')

        try:
            return func(*args, **kwargs)
        except Exception as e:
            message = str(e)
            # Fixed cool-down (not exponential, no retry) when the server
            # signals throttling; the error is still propagated to the caller.
            if "rate limit" in message.lower() or "503" in message:
                logger.warning(f"MusicBrainz rate limit hit, implementing backoff: {e}")
                time.sleep(2.0)
            raise
    return wrapper

class MusicBrainzClient:
    """Client for interacting with MusicBrainz API.

    All public methods are rate limited via ``rate_limited`` and are
    best-effort: any error (network failure, HTTP error status, malformed
    JSON, bad argument) is logged and turned into an empty list (search and
    browse methods) or ``None`` (lookup methods) instead of being raised.
    """

    BASE_URL = "https://musicbrainz.org/ws/2"
    # MusicBrainz mandates a meaningful User-Agent with contact info. Falling back
    # to a bare name/version risks IP blocking under load — include the project
    # URL so MB operators have a way to reach us if we misbehave.
    DEFAULT_CONTACT = "https://github.com/Nezreka/SoulSync"
    # Timeout, in seconds, applied to every HTTP request made by the client.
    REQUEST_TIMEOUT = 10
    # Largest page size requested from browse / arid-search endpoints
    # (MB hard cap).
    MAX_PAGE_SIZE = 100

    def __init__(self, app_name: str = "SoulSync", app_version: str = "1.0", contact_email: str = ""):
        """
        Initialize MusicBrainz client

        Args:
            app_name: Name of the application
            app_version: Version of the application
            contact_email: Contact email or URL (defaults to project URL when empty)
        """
        contact = contact_email or self.DEFAULT_CONTACT
        self.user_agent = f"{app_name}/{app_version} ( {contact} )"

        # One shared session so the identifying headers are sent on every call.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': self.user_agent,
            'Accept': 'application/json'
        })

        logger.info(f"MusicBrainz client initialized with user agent: {self.user_agent}")

    # ------------------------------------------------------------------
    # Internal helpers (not rate limited themselves; the public methods are)
    # ------------------------------------------------------------------

    @staticmethod
    def _escape_lucene(text: str) -> str:
        """Escape backslashes and double quotes for use in a Lucene query."""
        # Backslashes first, otherwise the ones added for quotes get doubled.
        return text.replace('\\', '\\\\').replace('"', '\\"')

    @classmethod
    def _build_query(cls, field: str, title: str, artist_name: Optional[str], strict: bool) -> str:
        """Build the query string for a title (+ optional artist) search.

        Strict mode yields ``field:"title" AND artist:"artist"`` phrase
        clauses; non-strict mode yields the non-empty parts joined by a
        space. Quotes and backslashes are escaped in both modes so that a
        stray ``"`` in user input cannot open an unterminated phrase.
        """
        if strict:
            query = f'{field}:"{cls._escape_lucene(title)}"'
            if artist_name:
                query += f' AND artist:"{cls._escape_lucene(artist_name)}"'
            return query
        return ' '.join(cls._escape_lucene(part) for part in (title, artist_name) if part)

    def _get_json(self, path: str, params: Dict[str, Any]) -> Any:
        """GET ``BASE_URL/path`` and return the decoded JSON body.

        Raises whatever ``requests`` raises (connection errors, timeouts,
        ``HTTPError`` on a 4xx/5xx status, JSON decoding errors).
        """
        response = self.session.get(
            f"{self.BASE_URL}/{path}",
            params=params,
            timeout=self.REQUEST_TIMEOUT
        )
        response.raise_for_status()
        return response.json()

    def _search(self, entity: str, result_key: str, query: str, limit: int) -> List[Dict[str, Any]]:
        """Run a query against the ``entity`` search endpoint.

        Returns the list stored under ``result_key`` in the response, or an
        empty list when that key is absent. Errors propagate to the caller.
        """
        data = self._get_json(entity, {'query': query, 'fmt': 'json', 'limit': limit})
        return data.get(result_key, [])

    def _lookup(self, entity: str, mbid: str, includes: Optional[List[str]]) -> Optional[Dict[str, Any]]:
        """Fetch a single ``entity`` by MBID; log and return None on any error."""
        try:
            params = {'fmt': 'json'}
            if includes:
                params['inc'] = '+'.join(includes)
            return self._get_json(f"{entity}/{mbid}", params)
        except Exception as e:
            logger.error(f"Error fetching {entity} {mbid}: {e}")
            return None

    # ------------------------------------------------------------------
    # Search
    # ------------------------------------------------------------------

    @rate_limited
    def search_artist(self, artist_name: str, limit: int = 10, strict: bool = True) -> List[Dict[str, Any]]:
        """
        Search for artists by name.

        Args:
            artist_name: Name of the artist to search for
            limit: Maximum number of results to return
            strict: When True (default), builds a phrase-match query against
                the `artist` field only — correct for enrichment flows that
                already know the exact name. When False, sends a bare query
                which MusicBrainz matches against the alias, artist, AND
                sortname indexes — the right behavior for user-facing fuzzy
                search (finds "Metallica" from typing "metalica", matches
                aliased names, etc.).

        Returns:
            List of artist results with id, name, score, etc. MusicBrainz
            assigns each result a `score` 0-100; the list is pre-sorted
            score-descending by the server. Empty list on any error.
        """
        try:
            escaped = self._escape_lucene(artist_name)
            # Bare query hits alias/artist/sortname indexes — much better
            # recall for user typing. Quotes/backslashes stay escaped there
            # too so they are treated as literal characters.
            query = f'artist:"{escaped}"' if strict else escaped

            artists = self._search('artist', 'artists', query, limit)

            logger.debug(f"Found {len(artists)} artists for query: {artist_name}")
            return artists

        except Exception as e:
            logger.error(f"Error searching for artist '{artist_name}': {e}")
            return []

    @rate_limited
    def search_release(self, album_name: str, artist_name: Optional[str] = None,
                       limit: int = 10, strict: bool = True) -> List[Dict[str, Any]]:
        """
        Search for releases (albums) by name.

        Args:
            album_name: Name of the album to search for
            artist_name: Optional artist name to narrow search
            limit: Maximum number of results to return
            strict: When True (default), builds a phrase-match Lucene query
                against the `release` and `artist` fields — correct for
                enrichment flows where exact name+artist are known. When
                False, sends a bare query (album + artist joined) so MB
                hits alias / sortname indexes and folds diacritics,
                dramatically improving recall for user-facing fuzzy
                lookups (e.g. the manual Fix popup). Recovers cases like
                "Bjork" → "Björk" that strict phrase queries miss.
                Double quotes and backslashes are escaped in both modes.

        Returns:
            List of release results; empty list on any error.
        """
        try:
            query = self._build_query('release', album_name, artist_name, strict)
            releases = self._search('release', 'releases', query, limit)

            logger.debug(f"Found {len(releases)} releases for query: {album_name}")
            return releases

        except Exception as e:
            logger.error(f"Error searching for release '{album_name}': {e}")
            return []

    @rate_limited
    def search_recording(self, track_name: str, artist_name: Optional[str] = None,
                         limit: int = 10, strict: bool = True) -> List[Dict[str, Any]]:
        """
        Search for recordings (tracks) by name.

        Args:
            track_name: Name of the track to search for
            artist_name: Optional artist name to narrow search
            limit: Maximum number of results to return
            strict: When True (default), builds a phrase-match Lucene query
                against the `recording` and `artist` fields — correct for
                enrichment flows where exact name+artist are known. When
                False, sends a bare query (track + artist joined) so MB
                hits alias / sortname indexes and folds diacritics. The
                bare path also avoids the AND-clause that kills recall
                when either side mis-matches (e.g. "Bjork" vs canonical
                "Björk", or a track title with bracketed suffix like
                "(Live)" that strict phrase match rejects).
                Double quotes and backslashes are escaped in both modes.

        Returns:
            List of recording results; empty list on any error.
        """
        try:
            query = self._build_query('recording', track_name, artist_name, strict)
            recordings = self._search('recording', 'recordings', query, limit)

            logger.debug(f"Found {len(recordings)} recordings for query: {track_name}")
            return recordings

        except Exception as e:
            logger.error(f"Error searching for recording '{track_name}': {e}")
            return []

    # ------------------------------------------------------------------
    # Artist-scoped listings
    # ------------------------------------------------------------------

    @rate_limited
    def browse_artist_release_groups(self, artist_mbid: str,
                                     release_types: Optional[List[str]] = None,
                                     limit: int = 100,
                                     offset: int = 0) -> List[Dict[str, Any]]:
        """Browse release-groups linked to an artist MBID.

        This is the correct MusicBrainz pattern for "give me this artist's
        discography" — text-based `/release?query=...` search would look at
        release TITLES (matching unrelated releases literally titled after
        the artist name), while browse walks the artist→release-group link
        directly.

        Args:
            artist_mbid: Artist's MusicBrainz ID
            release_types: Filter by primary type — any of 'album', 'single',
                'ep', 'compilation', 'soundtrack', 'live', etc. Combined with
                `|` per MB spec, e.g. `['album', 'ep']` → `type=album|ep`.
                None returns all types.
            limit: 1-100 (MB hard cap); larger values are reduced to 100
            offset: Pagination offset

        Returns:
            List of release-group dicts. Each has `id`, `title`, `primary-type`,
            `secondary-types`, `first-release-date`, `disambiguation`.
            Empty list on any error.
        """
        try:
            params = {
                'artist': artist_mbid,
                'fmt': 'json',
                'limit': min(limit, self.MAX_PAGE_SIZE),
                'offset': offset,
            }
            if release_types:
                params['type'] = '|'.join(release_types)

            rgs = self._get_json('release-group', params).get('release-groups', [])
            logger.debug(f"Browsed {len(rgs)} release-groups for artist {artist_mbid}")
            return rgs
        except Exception as e:
            logger.error(f"Error browsing release-groups for artist {artist_mbid}: {e}")
            return []

    @rate_limited
    def search_recordings_by_artist_mbid(self, artist_mbid: str,
                                         limit: int = 100) -> List[Dict[str, Any]]:
        """Search for recordings linked to an artist via Lucene `arid:` query.

        This is the counterpart to `browse_artist_release_groups` for tracks.
        The proper "browse" endpoint (`/recording?artist=<mbid>`) rejects
        `inc=releases`, so we can't get album context per recording from
        browse — only the track title/length/MBID. Without release info the
        user would see tracks with no album, which is useless.

        The search endpoint with a fielded `arid:<mbid>` query returns
        recordings with the `releases` array already embedded (including
        release-group, date, and media info), which is what the search-tab
        UI needs.

        Args:
            artist_mbid: Artist's MusicBrainz ID
            limit: 1-100 (MB hard cap); larger values are reduced to 100

        Returns:
            List of recording dicts with `id`, `title`, `length`, `score`,
            `artist-credit`, and `releases` (each with release-group + date).
            Empty list on any error.
        """
        try:
            recs = self._search(
                'recording',
                'recordings',
                f'arid:{artist_mbid}',
                min(limit, self.MAX_PAGE_SIZE),
            )
            logger.debug(f"Found {len(recs)} recordings for artist {artist_mbid}")
            return recs
        except Exception as e:
            logger.error(f"Error searching recordings for artist {artist_mbid}: {e}")
            return []

    # ------------------------------------------------------------------
    # Lookups by MBID
    # ------------------------------------------------------------------

    @rate_limited
    def get_artist(self, mbid: str, includes: Optional[List[str]] = None) -> Optional[Dict[str, Any]]:
        """
        Get full artist details by MusicBrainz ID

        Args:
            mbid: MusicBrainz ID of the artist
            includes: Optional list of additional data to include (e.g., 'url-rels', 'genres')

        Returns:
            Artist data, or None if the request failed for any reason
        """
        return self._lookup('artist', mbid, includes)

    @rate_limited
    def get_release(self, mbid: str, includes: Optional[List[str]] = None) -> Optional[Dict[str, Any]]:
        """
        Get full release details by MusicBrainz ID

        Args:
            mbid: MusicBrainz ID of the release
            includes: Optional list of additional data to include

        Returns:
            Release data, or None if the request failed for any reason
        """
        return self._lookup('release', mbid, includes)

    @rate_limited
    def get_release_group(self, mbid: str, includes: Optional[List[str]] = None) -> Optional[Dict[str, Any]]:
        """Get full release-group details by MBID.

        Release-groups are the 'canonical album' entity in MusicBrainz —
        they group every edition/reissue/region-specific release of the
        same logical album under one MBID. Use `inc=releases` to list the
        individual releases this group contains (each with its own
        tracklist); use `inc=artist-credits` for artist info.

        Args:
            mbid: Release-group's MusicBrainz ID
            includes: Optional list, e.g. ['releases', 'artist-credits']

        Returns:
            Release-group data, or None if the request failed for any reason.
        """
        return self._lookup('release-group', mbid, includes)

    @rate_limited
    def get_recording(self, mbid: str, includes: Optional[List[str]] = None) -> Optional[Dict[str, Any]]:
        """
        Get full recording details by MusicBrainz ID

        Args:
            mbid: MusicBrainz ID of the recording
            includes: Optional list of additional data to include

        Returns:
            Recording data, or None if the request failed for any reason
        """
        return self._lookup('recording', mbid, includes)