From 70e1750948326a3587e0554fcad8d1e9cbafe08b Mon Sep 17 00:00:00 2001 From: Broque Thomas <26755000+Nezreka@users.noreply.github.com> Date: Fri, 8 May 2026 15:28:51 -0700 Subject: [PATCH] Stop docker image bloat from auto-downloaded ffmpeg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kettui reported the dev image roughly doubled in size after a recent nightly build. codex investigation traced it back to: 1. nightly workflow runs `python -m pytest` before docker build 2. one of the new tests imports web_server (test_tidal_auth_instructions.py) 3. importing web_server constructs YouTubeClient 4. YouTubeClient.__init__ called _check_ffmpeg() — which auto-downloads a ~388 MB ffmpeg/ffprobe bundle into ./tools/ when system ffmpeg isn't on PATH (CI runner doesn't have it) 5. .dockerignore didn't exclude tools/ffmpeg or tools/ffprobe 6. docker `COPY . .` shipped the binaries 7. the immediately-following `chown -R /app` rewrote every file into a new layer — so the 388 MB payload got counted twice in image size three fixes: 1. .dockerignore — block the auto-downloaded binaries even if they leak into the workspace (tools/ffmpeg, tools/ffprobe, .exe variants, .zip and .tar.xz download archives). Defense-in-depth so a future regression in the test/import path can't bloat the image again. 2. youtube_client — split _check_ffmpeg into a side-effect-free _locate_ffmpeg (pure existence check) and the original auto- download _check_ffmpeg. __init__ now calls _locate_ffmpeg + logs a warning when missing instead of triggering download. is_available() and the actual download dispatch paths still call _check_ffmpeg — so end users still get auto-download on first YouTube use, but `import web_server` doesn't drag a 388 MB binary into the workspace. 3. Dockerfile — replaced `COPY . .` + `chown -R /app` with `COPY --chown=soulsync:soulsync . .` + a scoped chown on just the runtime mount-point dirs. eliminates the layer that duplicated the entire /app tree just to flip ownership bits, so even legit workspace content isn't double-counted in the image. Combined effect: image size returns to baseline + future ffmpeg leaks can't bloat it. Inside the container nothing changes — the Dockerfile already installs system ffmpeg via apt, so YouTube downloads find it on PATH on first use and the auto-download path never fires. 2259 passed, 1 skipped, 0 failed. --- .dockerignore | 14 +++++++++++ Dockerfile | 17 +++++++++---- core/youtube_client.py | 56 +++++++++++++++++++++++++++++++++++++++--- 3 files changed, 78 insertions(+), 9 deletions(-) diff --git a/.dockerignore b/.dockerignore index 4b77bb74..c3344205 100644 --- a/.dockerignore +++ b/.dockerignore @@ -51,6 +51,20 @@ Incomplete/* artist_bubble_snapshots.json .spotify_cache +# Auto-downloaded ffmpeg binaries — the YouTube client downloads these +# into tools/ when system ffmpeg isn't on PATH. The Dockerfile installs +# system ffmpeg via apt, so the container never needs the bundled +# binaries. If a CI run leaves them in the workspace before the docker +# build (e.g. because a test imported web_server which initialized the +# YouTube client), they'd otherwise get baked into the image — adding +# ~388 MB and getting duplicated again by the chown layer. +tools/ffmpeg +tools/ffprobe +tools/ffmpeg.exe +tools/ffprobe.exe +tools/*.zip +tools/*.tar.xz + # Documentation *.md README.md diff --git a/Dockerfile b/Dockerfile index fbda152c..d4caefe5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -45,13 +45,20 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ # Create non-root user for security RUN useradd --create-home --shell /bin/bash --uid 1000 soulsync -# Copy application code -COPY . . - -# Create necessary directories with proper permissions +# Copy application code with ownership baked in. +# Using `COPY --chown` instead of `COPY` + `chown -R /app` avoids an +# extra image layer that duplicates the entire /app tree just to flip +# ownership bits — Docker layers are immutable, so chown -R rewrites +# every file into a new layer. On a clean repo that's small; if any +# bulky workspace file slips in (e.g. auto-downloaded ffmpeg binaries +# in tools/), it gets counted twice in the image. Cin caught this on +# 2026-05-08 — see the .dockerignore comment for the same incident. +COPY --chown=soulsync:soulsync . . + +# Create runtime mount-point directories the app expects to exist. # NOTE: /app/data is for database FILES, /app/database is the Python package RUN mkdir -p /app/config /app/data /app/logs /app/downloads /app/Transfer /app/MusicVideos /app/scripts && \ - chown -R soulsync:soulsync /app + chown soulsync:soulsync /app/config /app/data /app/logs /app/downloads /app/Transfer /app/MusicVideos /app/scripts # Create defaults directory and copy template files # These will be used by entrypoint.sh to initialize empty volumes diff --git a/core/youtube_client.py b/core/youtube_client.py index 4f1c4b76..33078bfd 100644 --- a/core/youtube_client.py +++ b/core/youtube_client.py @@ -156,10 +156,27 @@ class YouTubeClient(DownloadSourcePlugin): self.matching_engine = MusicMatchingEngine() logger.info("Initialized production MusicMatchingEngine") - # Check for ffmpeg (REQUIRED for MP3 conversion) - if not self._check_ffmpeg(): - logger.error("ffmpeg is required but not found") - logger.error("The client will attempt to auto-download ffmpeg on first use") + # NOTE: deliberately don't call `_check_ffmpeg()` here. That call + # has a side effect — it auto-downloads a ~388 MB ffmpeg/ffprobe + # bundle into ./tools/ when system ffmpeg isn't on PATH. Firing + # that during __init__ means importing web_server (which any + # test does — see tests/test_tidal_auth_instructions.py) triggers + # the download, leaves the binaries in the repo workspace, and + # if the CI runner does its docker build right after, the + # binaries get baked into the image (and duplicated again by the + # chown layer). Cin reported the resulting size doubling on + # 2026-05-08 so we moved the check off the import path. + # + # `_check_ffmpeg()` still runs lazily — `is_available()` calls + # it before reporting True, and the actual download flow checks + # it before invoking yt-dlp. Both are call paths the user opted + # into by choosing YouTube as a download source. + if not self._locate_ffmpeg(): + logger.warning( + "ffmpeg not found on PATH or in tools/ — will auto-download " + "on first YouTube use. (Skipping eager download to keep " + "test/import side-effects out of the repo workspace.)" + ) # Configure yt-dlp options with bot detection bypass self.download_opts = { @@ -372,6 +389,37 @@ class YouTubeClient(DownloadSourcePlugin): """ return self.current_download_progress.copy() + def _locate_ffmpeg(self) -> bool: + """Check whether ffmpeg is already available WITHOUT side effects. + + Used at __init__ time to log a warning if ffmpeg is missing. + Does NOT trigger the auto-download — that lives in + ``_check_ffmpeg`` and only fires from call paths the user opted + into (``is_available()`` and the actual download dispatch). + """ + import shutil + + if shutil.which('ffmpeg'): + return True + + tools_dir = Path(__file__).parent.parent / 'tools' + if platform.system().lower() == 'windows': + ffmpeg_path = tools_dir / 'ffmpeg.exe' + ffprobe_path = tools_dir / 'ffprobe.exe' + else: + ffmpeg_path = tools_dir / 'ffmpeg' + ffprobe_path = tools_dir / 'ffprobe' + + if ffmpeg_path.exists() and ffprobe_path.exists(): + # Make sure yt-dlp can find them — same PATH bump + # _check_ffmpeg does on the happy path. + tools_dir_str = str(tools_dir.absolute()) + if tools_dir_str not in os.environ.get('PATH', ''): + os.environ['PATH'] = tools_dir_str + os.pathsep + os.environ.get('PATH', '') + return True + + return False + def _check_ffmpeg(self) -> bool: """Check if ffmpeg is available (system PATH or auto-download to tools folder)""" import shutil