Improve YouTube download handling and shutdown safety

Adjusts matching weights for YouTube sources to rely more on title and duration, adds a shutdown callback to the YouTube client to prevent new downloads during shutdown, and enhances post-processing to reliably resolve actual YouTube file paths. Improves error handling for file removal, ensures no new batch downloads start during shutdown, and refines download monitoring to trigger post-processing on completed YouTube downloads. Also increases YouTube download retries and improves logging for debugging.
pull/115/head
Broque Thomas 4 months ago
parent af2265a68a
commit ddd2ebdb13

@ -572,13 +572,21 @@ class MusicMatchingEngine:
is_short_title = len(spotify_cleaned_title) <= 5 or len(title_words) == 1
# --- Final Weighted Score ---
# Rebalanced weights: Artist matching is now more important to prevent false positives
final_confidence = (title_score * 0.45) + (artist_score * 0.40) + (duration_score * 0.15)
is_youtube = slskd_track.username == 'youtube'
if is_youtube:
# For YouTube, rely more on Title and Duration since Artist is often missing from video titles
# and the search query already filtered by artist to some extent.
# New weights: Title 70%, Artist 10%, Duration 20%
final_confidence = (title_score * 0.70) + (artist_score * 0.10) + (duration_score * 0.20)
else:
# Standard weights for Soulseek (Artist is critical for correctness)
# Rebalanced weights: Artist matching is now more important to prevent false positives
final_confidence = (title_score * 0.45) + (artist_score * 0.40) + (duration_score * 0.15)
# Apply short title penalty AFTER calculating base confidence
# This allows perfect matches to still pass, but penalizes weak artist matches
# For YouTube, skip penalty since artist matching is less reliable (searches are track-name-only)
is_youtube = slskd_track.username == 'youtube'
if is_short_title and artist_score < 0.5 and not is_youtube:
# Heavy penalty but not complete rejection
# Multiply by 0.4 (60% penalty) - still possible to pass if title+duration are perfect
@ -596,13 +604,13 @@ class MusicMatchingEngine:
# Debug logging to track matching decisions
if final_confidence > 0.3: # Only log potential matches
logger.debug(
f"Match scoring: '{spotify_track.name}' by {spotify_track.artists[0] if spotify_track.artists else 'Unknown'} "
f"Match scoring ({'YT' if is_youtube else 'SLSK'}): '{spotify_track.name}' by {spotify_track.artists[0] if spotify_track.artists else 'Unknown'} "
f"vs '{slskd_track.filename[:60]}...' | "
f"Title: {title_score:.2f} (ratio: {title_ratio:.2f}, boundary: {has_word_boundary}), "
f"Artist: {artist_score:.2f}, Duration: {duration_score:.2f}, "
f"Final: {final_confidence:.2f} {'✅ PASS' if final_confidence > 0.58 else '❌ FAIL'}"
f"Final: {final_confidence:.2f} {'✅ PASS' if final_confidence > 0.63 else '❌ FAIL'}"
)
# Ensure the final score doesn't exceed 1.0
return min(final_confidence, 1.0)

@ -198,6 +198,7 @@ class DownloadStatus:
transferred: int
speed: int
time_remaining: Optional[int] = None
file_path: Optional[str] = None
class SoulseekClient:
def __init__(self):
@ -330,8 +331,9 @@ class SoulseekClient:
error_detail = response_text if response_text.strip() else "No error details provided"
# Reduce noise for expected 404s during search cleanup
if response.status == 404 and 'searches/' in url and method == 'DELETE':
logger.debug(f"Search not found for deletion (expected): {url}")
# Reduce noise for expected 404s (e.g. status checks for YouTube downloads)
if response.status == 404:
logger.debug(f"API request returned 404 (Not Found) for {url}")
else:
logger.error(f"API request failed: HTTP {response.status} ({response.reason}) - {error_detail}")
logger.debug(f"Failed request: {method} {url}")

@ -108,6 +108,13 @@ class YouTubeClient:
logger.info(f"📁 YouTube client using download path: {self.download_path}")
# Callback for shutdown check (avoids circular imports)
self.shutdown_check = None
def set_shutdown_check(self, check_callable):
    """Install *check_callable* as this client's shutdown probe.

    The callable takes no arguments and returns truthy once the host
    application is shutting down; download loops poll it (via
    ``self.shutdown_check()``) to abort new work early. It is injected
    from outside rather than imported to avoid a circular import on
    the web module.
    """
    # Just retain the reference; polling happens in the download loop.
    self.shutdown_check = check_callable
# Initialize production matching engine for parity with Soulseek
self.matching_engine = MusicMatchingEngine()
logger.info("✅ Initialized production MusicMatchingEngine")
@ -893,21 +900,35 @@ class YouTubeClient:
File path if successful, None otherwise
"""
try:
max_retries = 2
max_retries = 3
for attempt in range(max_retries):
# Check for server shutdown using callback
if self.shutdown_check and self.shutdown_check():
logger.info(f"🛑 Server shutting down, aborting download attempt {attempt + 1}")
return None
try:
# Use default download options
download_opts = self.download_opts.copy()
# Force best audio format to prevent 'Requested format not available' errors
download_opts['format'] = 'bestaudio/best'
download_opts['noplaylist'] = True
# On retry, try different player client
if attempt > 0:
logger.info(f"🔄 Retry {attempt + 1}/{max_retries} with different settings")
if attempt == 1:
logger.info(f"🔄 Retry {attempt + 1}/{max_retries} with different player client")
download_opts['extractor_args'] = {
'youtube': {
'player_client': ['web'], # Try web-only on retry
'skip': ['hls', 'dash'],
}
}
elif attempt >= 2:
logger.info(f"🔄 Retry {attempt + 1}/{max_retries} with 'best' format (video fallback)")
download_opts['format'] = 'best' # Fallback to best available (including video)
download_opts.pop('extractor_args', None) # Reset extractor args
# Perform download
with yt_dlp.YoutubeDL(download_opts) as ydl:
@ -1004,7 +1025,8 @@ class YouTubeClient:
size=download_info['size'],
transferred=download_info['transferred'],
speed=download_info['speed'],
time_remaining=download_info.get('time_remaining')
time_remaining=download_info.get('time_remaining'),
file_path=download_info.get('file_path')
)
async def cancel_download(self, download_id: str, username: str = None, remove: bool = False) -> bool:

@ -120,6 +120,12 @@ try:
tidal_client = TidalClient()
matching_engine = MusicMatchingEngine()
sync_service = PlaylistSyncService(spotify_client, plex_client, soulseek_client, jellyfin_client, navidrome_client)
# Inject shutdown check callback into YouTube client (avoids circular imports)
# The callback uses the global IS_SHUTTING_DOWN flag from this module
if hasattr(soulseek_client, 'youtube'):
soulseek_client.youtube.set_shutdown_check(lambda: IS_SHUTTING_DOWN)
print("✅ Configured YouTube client shutdown callback")
# Initialize web scan manager for automatic post-download scanning
media_clients = {
@ -499,6 +505,22 @@ class WebUIDownloadMonitor:
# Check for timeouts and errors - retries handled directly in _should_retry_task
self._should_retry_task(task_id, task, live_transfers_lookup, current_time)
# ENHANCED: Check for successful completions (especially YouTube)
task_filename = task.get('filename') or task.get('track_info', {}).get('filename')
task_username = task.get('username') or task.get('track_info', {}).get('username')
if task_filename and task_username:
lookup_key = f"{task_username}::{extract_filename(task_filename)}"
live_info = live_transfers_lookup.get(lookup_key)
if live_info:
state = live_info.get('state', '')
# Trigger post-processing if download is completed but still marked as downloading locally
# 'Completed' is used by YouTubeClient, 'Succeeded' by Soulseek
if state in ['Completed', 'Succeeded'] and task['status'] == 'downloading':
print(f"✅ Monitor detected completed download for {task_id} ({state}) - triggering post-processing")
_on_download_completed(task_id, success=True)
# ENHANCED: Add worker count validation to detect ghost workers
self._validate_worker_counts()
@ -1008,10 +1030,23 @@ def cleanup_monitor():
batch_locks.clear()
print("🧹 Cleaned up batch locks")
# Global shutdown flag
IS_SHUTTING_DOWN = False
def signal_handler(signum, frame):
    """Handle SIGINT (Ctrl+C) and SIGTERM.

    Flips the module-level ``IS_SHUTTING_DOWN`` flag first so that
    polling loops (batch manager, YouTube client callback) stop
    scheduling new work, then tears down the monitor and the download
    executor before terminating the process.
    """
    global IS_SHUTTING_DOWN
    print(f"🛑 Signal {signum} received, cleaning up...")
    # Flag must flip before cleanup so in-flight workers observe it.
    IS_SHUTTING_DOWN = True
    cleanup_monitor()
    # Best-effort executor shutdown; a failure here must not block exit.
    try:
        print("🛑 Shutting down missing_download_executor...")
        missing_download_executor.shutdown(wait=False, cancel_futures=True)
    except Exception as exc:
        print(f"⚠️ Error shutting down executor: {exc}")
    sys.exit(0)
# Register cleanup handlers
@ -8212,14 +8247,26 @@ def _post_process_matched_download(context_key, context, file_path):
if has_metadata:
print(f"⚠️ [Protection] Existing file already has metadata enhancement - skipping overwrite: {os.path.basename(final_path)}")
print(f"🗑️ [Protection] Removing redundant download file: {os.path.basename(file_path)}")
os.remove(file_path) # Remove the redundant file
try:
os.remove(file_path) # Remove the redundant file
except FileNotFoundError:
print(f"⚠️ [Protection] Could not remove redundant file (already gone): {file_path}")
except Exception as e:
print(f"⚠️ [Protection] Error removing redundant file: {e}")
return # Don't overwrite the good file
else:
print(f"🔄 [Protection] Existing file lacks metadata - safe to overwrite: {os.path.basename(final_path)}")
os.remove(final_path)
try:
os.remove(final_path)
except FileNotFoundError:
pass # It was just there, but now gone?
except Exception as check_error:
print(f"⚠️ [Protection] Error checking existing file metadata, proceeding with overwrite: {check_error}")
os.remove(final_path)
try:
if os.path.exists(final_path):
os.remove(final_path)
except Exception as e:
print(f"⚠️ [Protection] Failed to remove existing file for overwrite: {e}")
_safe_move_file(file_path, final_path)
@ -10684,6 +10731,11 @@ def get_valid_candidates(results, spotify_track, query):
normalized_spotify_artist = re.sub(r'[^a-zA-Z0-9]', '', spotify_artist_name).lower()
for candidate in quality_filtered_candidates:
# Skip artist check for YouTube results (title matching is sufficient as processed by matching engine)
if is_youtube_source:
verified_candidates.append(candidate)
continue
# This check is critical: it ensures the artist's name is in the file path,
# preventing downloads from the wrong artist.
normalized_slskd_path = re.sub(r'[^a-zA-Z0-9]', '', candidate.filename).lower()
@ -10757,6 +10809,11 @@ def _start_next_batch_of_downloads(batch_id):
batch_lock = _get_batch_lock(batch_id)
with batch_lock:
# Prevent starting new tasks if shutting down
if IS_SHUTTING_DOWN:
print(f"🛑 [Batch Manager] Server shutting down - skipping new tasks for batch {batch_id}")
return
with tasks_lock:
if batch_id not in download_batches:
return
@ -11640,7 +11697,58 @@ def _run_post_processing_worker(task_id, batch_id):
# RESILIENT FILE-FINDING LOOP: Try up to 3 times with delays
found_file = None
file_location = None
# CRITICAL FIX: For YouTube downloads, the filename in task is 'id||title' (metadata),
# but the actual file on disk is 'Title.mp3'. We must ask the client for the real path.
if (task.get('username') == 'youtube' or '||' in str(task_filename)) and not found_file:
logger.info(f"🔧 [Post-Processing] Detected YouTube download task: {task_id}")
try:
# Query the download orchestrator for the status which contains the real file path
# CRITICAL FIX: Use the actual download_id designated by the client, not the internal task_id
actual_download_id = task.get('download_id') or task_id
status = asyncio.run(soulseek_client.get_download_status(actual_download_id))
if status and status.file_path:
real_path = status.file_path
if os.path.exists(real_path):
# Determine if it's in download or transfer directory
real_path_obj = Path(real_path)
download_dir_obj = Path(download_dir)
transfer_dir_obj = Path(transfer_dir)
# Use absolute path comparison
try:
if download_dir_obj.resolve() in real_path_obj.resolve().parents:
file_location = 'download'
elif transfer_dir_obj.resolve() in real_path_obj.resolve().parents:
file_location = 'transfer'
else:
file_location = 'absolute'
except:
# Fallback if resolve fails (e.g. permission or path issues)
file_location = 'absolute'
if file_location:
# We found the file! Use the absolute path if it confuses the joining logic,
# but usually we want just the filename if location is 'download'/'transfer'
# CRITICAL FIX: Always use the absolute real_path.
# Stripping to basename causes FileNotFoundError because post-processing
# runs with CWD as project root, not download dir.
found_file = real_path
logger.info(f"✅ [Post-Processing] Resolved actual YouTube filename: {found_file} (Location: {file_location})")
else:
logger.warning(f"⚠️ [Post-Processing] YouTube status reported path but file missing: {real_path}")
else:
logger.warning(f"⚠️ [Post-Processing] YouTube status returned no file_path for task {task_id}")
except Exception as e:
logger.error(f"⚠️ [Post-Processing] Failed to retrieve YouTube task status: {e}")
for retry_count in range(3):
# If we already resolved the file (e.g. via YouTube status), skip searching
if found_file:
print(f"🎯 [Post-Processing] Skipping search loop, file already resolved: {found_file}")
break
print(f"🔍 [Post-Processing] Attempt {retry_count + 1}/3 to find file")
print(f"🔍 [Post-Processing] Original filename: {task_basename}")
if expected_final_filename:

Loading…
Cancel
Save