From a9a6168568e42cdda4a2d7951b3bd08c63df7f65 Mon Sep 17 00:00:00 2001 From: Broque Thomas <26755000+Nezreka@users.noreply.github.com> Date: Sat, 9 May 2026 11:37:36 -0700 Subject: [PATCH] Auto-import scanner: group loose files by album + always recurse subfolders MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two related bugs in `AutoImportWorker._scan_directory` surfaced during real-world testing of the chaotic-staging case (user dropped loose tracks from multiple albums at staging root, alongside intact album subfolders): Bug 1 — Loose files bundled into one fake "album" When loose audio files existed at a level, the scanner built ONE FolderCandidate from all of them regardless of their album tags. On a chaotic staging root with tracks from 3+ different albums, the identifier picked the most-common album tag and the matcher left every other album's tracks unmatched (or mis-attributed via filename + position guessing). Bug 2 — Subfolders silently ignored when root has loose files The scanner only recursed into non-disc subfolders when there were NO loose files at the parent level. So a layout like: Staging/ loose1.flac (processed via the loose-files path) Other Album Folder/ (silently ignored — never scanned) would skip the album subfolders entirely. Common pattern when a user moves a few tracks out of an album folder while leaving the rest of the parent album folder intact, OR when other album folders sit alongside a partially-extracted album. Fix: `_build_loose_file_candidates` (new method) reads each loose file's `album` tag and groups by normalised album name. Each group becomes its own FolderCandidate so a chaotic staging root produces one candidate per album — identifier + matcher run cleanly per album. Untagged loose files become individual single candidates. Disc folders at the same level attach to whichever loose-file group's album tag matches the disc-folder tracks; standalone disc folders (no matching loose group) get their own multi-disc candidate. The scanner now ALSO always recurses into non-disc subdirectories, even when the current level has loose files. So album subfolders sitting beside loose tracks get processed independently in their own recursive scan. Behavior preservation: - Single-album loose-files staging (every file shares one album tag, no parallel disc folders) → one FolderCandidate, identical to pre-fix behavior. Pinned by `test_single_album_loose_files_still_one_candidate`. - Disc-only directory (no loose files, only Disc 1/Disc 2 subdirs) → one multi-disc FolderCandidate, identical to pre-fix. Pinned by `test_disc_only_directory_still_works`. 7 new tests in `tests/imports/test_auto_import_scanner_grouping.py`: - Multiple-album loose root → multiple candidates - Untagged loose files → individual singles - Single-album loose-files regression guard - Subfolders recursed even when root has loose files - Disc folder attaches to matching loose group by album tag - Disc folder with no matching loose group → standalone candidate - Disc-only directory regression guard All write real FLACs via mutagen + exercise `_scan_directory` end-to-end (no mocking the tag reader — proves the production read path works). Verification: - 7 new tests pass - 2328 full suite passes (+7 new), 1 pre-existing flaky timing test unrelated to this PR - Ruff clean - All changes still scoped to import flow — download flow byte- identical --- core/auto_import_worker.py | 252 ++++++++++++----- .../test_auto_import_scanner_grouping.py | 266 ++++++++++++++++++ 2 files changed, 444 insertions(+), 74 deletions(-) create mode 100644 tests/imports/test_auto_import_scanner_grouping.py diff --git a/core/auto_import_worker.py b/core/auto_import_worker.py index b8dd1e88..42190f9e 100644 --- a/core/auto_import_worker.py +++ b/core/auto_import_worker.py @@ -436,13 +436,33 @@ class AutoImportWorker: return candidates def _scan_directory(self, directory: str, candidates: List[FolderCandidate], staging_root: str = ''): - """Recursively scan a directory for album folders and loose audio files.""" + """Recursively scan a directory for album folders and loose audio files. + + Loose-file handling: + - Read each loose file's `album` tag and group by normalised + album name. Each group becomes its own candidate so a chaotic + staging root (multiple albums dumped loose) imports correctly + instead of bundling everything into one fake "album." + - Untagged loose files become individual single candidates (they + have nothing to group with). + - Disc folders at the same level attach to the loose-file group + whose album tag matches the disc-folder files (typical layout: + loose files for disc 1 + `Disc 2/`, `Disc 3/` subfolders). + - Disc folders with no matching loose group become standalone + multi-disc candidates. + + Recursion rule: + - Always recurse into non-disc subdirectories. The previous + rule "only recurse when no loose files exist" silently + ignored album subfolders sitting next to loose files — + common when a user moves some tracks out of an album folder + while leaving the parent album folder intact. + """ try: entries = sorted(os.listdir(directory)) except OSError: return - # Collect loose audio files at this level loose_files = [] subdirs = [] @@ -453,83 +473,167 @@ class AutoImportWorker: elif os.path.isdir(full_path): subdirs.append((entry, full_path)) - if loose_files: - # This directory has audio files — treat it as an album folder candidate - audio_files = loose_files - disc_structure = {} + disc_subdirs = [(n, p) for n, p in subdirs if DISC_FOLDER_RE.match(n)] + non_disc_subdirs = [(n, p) for n, p in subdirs if not DISC_FOLDER_RE.match(n)] - # Check if any subdirs are disc folders - has_disc_folders = False - for sub_name, sub_path in subdirs: - disc_match = DISC_FOLDER_RE.match(sub_name) - if disc_match: - has_disc_folders = True - disc_num = int(disc_match.group(1)) - disc_files = [os.path.join(sub_path, f) for f in sorted(os.listdir(sub_path)) - if os.path.isfile(os.path.join(sub_path, f)) - and os.path.splitext(f)[1].lower() in AUDIO_EXTENSIONS] - if disc_files: - disc_structure[disc_num] = disc_files - audio_files.extend(disc_files) - - if has_disc_folders: - disc_structure[0] = loose_files # Top-level files are disc 0 - - # Determine if this is a single or album - is_single = len(audio_files) == 1 and not has_disc_folders - folder_name = os.path.basename(directory) - folder_hash = _compute_folder_hash(audio_files) + # Build disc_structure from disc subdirs once — referenced by + # both the loose-files branch (to attach matching discs to the + # right loose-file group) and the disc-only branch. + disc_files_by_num: Dict[int, List[str]] = {} + for sub_name, sub_path in disc_subdirs: + disc_num = int(DISC_FOLDER_RE.match(sub_name).group(1)) + try: + disc_files = [os.path.join(sub_path, f) for f in sorted(os.listdir(sub_path)) + if os.path.isfile(os.path.join(sub_path, f)) + and os.path.splitext(f)[1].lower() in AUDIO_EXTENSIONS] + except OSError: + disc_files = [] + if disc_files: + disc_files_by_num[disc_num] = disc_files - if is_single: - candidates.append(FolderCandidate( - path=audio_files[0], name=os.path.basename(audio_files[0]), - audio_files=audio_files, folder_hash=folder_hash, is_single=True - )) - else: + if loose_files: + self._build_loose_file_candidates( + directory, loose_files, disc_files_by_num, candidates, + ) + elif disc_files_by_num and not non_disc_subdirs: + # Disc-only directory — treat THIS directory as the album. + # Common when a user drops `Disc 1/`, `Disc 2/` straight + # into staging without an album-level loose-file group. + audio_files: List[str] = [] + disc_structure: Dict[int, List[str]] = {} + for disc_num, disc_files in disc_files_by_num.items(): + disc_structure[disc_num] = disc_files + audio_files.extend(disc_files) + + if audio_files: + folder_name = os.path.basename(directory) + folder_hash = _compute_folder_hash(audio_files) + is_staging_root = bool(staging_root) and os.path.normpath(directory) == os.path.normpath(staging_root) candidates.append(FolderCandidate( path=directory, name=folder_name, audio_files=audio_files, - disc_structure=disc_structure, folder_hash=folder_hash + disc_structure=disc_structure, folder_hash=folder_hash, + is_staging_root=is_staging_root, )) - else: - # No loose audio files. If the only subdirs are disc folders, - # treat THIS directory as the album candidate (multi-disc album - # with no album-level loose files — common when a user drops - # `Album/Disc 1/`, `Album/Disc 2/` straight into staging, or - # drops `Disc 1/`, `Disc 2/` with the staging dir itself as - # the album root). - disc_subdirs = [(n, p) for n, p in subdirs if DISC_FOLDER_RE.match(n)] - non_disc_subdirs = [(n, p) for n, p in subdirs if not DISC_FOLDER_RE.match(n)] - - if disc_subdirs and not non_disc_subdirs: - disc_structure = {} - audio_files = [] - for sub_name, sub_path in disc_subdirs: - disc_num = int(DISC_FOLDER_RE.match(sub_name).group(1)) - try: - disc_files = [os.path.join(sub_path, f) for f in sorted(os.listdir(sub_path)) - if os.path.isfile(os.path.join(sub_path, f)) - and os.path.splitext(f)[1].lower() in AUDIO_EXTENSIONS] - except OSError: - disc_files = [] - if disc_files: - disc_structure[disc_num] = disc_files - audio_files.extend(disc_files) - - if audio_files: - folder_name = os.path.basename(directory) - folder_hash = _compute_folder_hash(audio_files) - is_staging_root = bool(staging_root) and os.path.normpath(directory) == os.path.normpath(staging_root) - candidates.append(FolderCandidate( - path=directory, name=folder_name, audio_files=audio_files, - disc_structure=disc_structure, folder_hash=folder_hash, - is_staging_root=is_staging_root, - )) - return - - # Otherwise recurse into non-disc subdirs (disc folders only - # ever attach to a parent album, never stand alone). - for _sub_name, sub_path in non_disc_subdirs: - self._scan_directory(sub_path, candidates, staging_root=staging_root) + + # Always recurse into non-disc subdirectories — even when this + # level has loose files. Otherwise album subfolders sitting + # beside loose tracks get silently ignored (the bug a chaotic + # staging root surfaced on 2026-05-09). + for _sub_name, sub_path in non_disc_subdirs: + self._scan_directory(sub_path, candidates, staging_root=staging_root) + + def _build_loose_file_candidates( + self, + directory: str, + loose_files: List[str], + disc_files_by_num: Dict[int, List[str]], + candidates: List[FolderCandidate], + ) -> None: + """Group loose audio files by `album` tag, build one candidate + per album group + attach matching disc folders. + + - Tagged files cluster by their album name (case-insensitive, + whitespace-stripped). + - Untagged files become individual single candidates (can't + group what we don't have a key for). + - Disc folders attach to whichever loose group's album tag + matches the first disc-folder track's album tag. Disc folders + with no matching loose group fall through to a standalone + multi-disc candidate scoped to that album. + - When all loose files share one album AND disc folders attach + to it, the result matches the previous "bundle everything" + behavior — so single-album staging with parallel disc folders + (the user's Mr. Morale layout) keeps working unchanged. + """ + # Group by normalised album tag + groups: Dict[str, List[str]] = {} + untagged: List[str] = [] + for f in loose_files: + try: + tags = _read_file_tags(f) + except Exception as exc: + logger.debug("scan tag read failed for %s: %s", f, exc) + tags = {} + album_key = (tags.get('album') or '').strip().lower() + if album_key: + groups.setdefault(album_key, []).append(f) + else: + untagged.append(f) + + # Attach disc folders to matching groups. Read the first track + # of each disc to find its album tag and merge accordingly. + disc_attached_to: Dict[int, str] = {} # disc_num → album_key + for disc_num, disc_files in disc_files_by_num.items(): + try: + first_disc_tags = _read_file_tags(disc_files[0]) + except Exception: + first_disc_tags = {} + disc_album_key = (first_disc_tags.get('album') or '').strip().lower() + if disc_album_key and disc_album_key in groups: + disc_attached_to[disc_num] = disc_album_key + + # Track which disc nums got merged into a loose group so we + # don't double-count them in the standalone-disc fallback. + merged_disc_nums = set(disc_attached_to.keys()) + + # Build a candidate per loose-file group + for album_key, group_files in groups.items(): + audio_files = list(group_files) + disc_structure: Dict[int, List[str]] = {0: list(group_files)} + for disc_num, attached_album in disc_attached_to.items(): + if attached_album == album_key: + audio_files.extend(disc_files_by_num[disc_num]) + disc_structure[disc_num] = list(disc_files_by_num[disc_num]) + + folder_hash = _compute_folder_hash(audio_files) + # Use the album tag for the candidate name so the import + # history shows something meaningful instead of always the + # parent directory name. + display_name = group_files[0] + try: + first_tags = _read_file_tags(group_files[0]) + if first_tags.get('album'): + display_name = first_tags['album'] + except Exception as exc: + logger.debug("display-name tag read failed for %s: %s", group_files[0], exc) + + candidates.append(FolderCandidate( + path=directory, + name=os.path.basename(directory) if len(groups) == 1 else str(display_name), + audio_files=audio_files, + disc_structure=disc_structure if len(disc_structure) > 1 else {}, + folder_hash=folder_hash, + )) + + # Untagged singles — one candidate per file. Can't group them. + for f in untagged: + audio_files = [f] + folder_hash = _compute_folder_hash(audio_files) + candidates.append(FolderCandidate( + path=f, name=os.path.basename(f), + audio_files=audio_files, folder_hash=folder_hash, is_single=True, + )) + + # Standalone disc folders (no loose group claimed them) — bundle + # into a multi-disc candidate scoped to the directory. + unattached_discs = { + n: files for n, files in disc_files_by_num.items() + if n not in merged_disc_nums + } + if unattached_discs: + audio_files = [] + disc_structure = {} + for disc_num, disc_files in unattached_discs.items(): + disc_structure[disc_num] = disc_files + audio_files.extend(disc_files) + folder_hash = _compute_folder_hash(audio_files) + candidates.append(FolderCandidate( + path=directory, + name=f"{os.path.basename(directory)} (loose discs)", + audio_files=audio_files, + disc_structure=disc_structure, + folder_hash=folder_hash, + )) def _is_folder_stable(self, candidate: FolderCandidate) -> bool: """Check if folder contents have stopped changing.""" diff --git a/tests/imports/test_auto_import_scanner_grouping.py b/tests/imports/test_auto_import_scanner_grouping.py new file mode 100644 index 00000000..1d13c831 --- /dev/null +++ b/tests/imports/test_auto_import_scanner_grouping.py @@ -0,0 +1,266 @@ +"""Tests for the chaotic-staging scanner improvements in +``AutoImportWorker._scan_directory``. + +Two related behaviors pinned here: + +1. **Loose files grouped by album tag.** When the staging root has + loose files from multiple different albums (user moved tracks out + of their album folders + dumped them at root), each album's tracks + get their own candidate via the embedded `album` tag. Pre-fix: + everything bundled into one fake album, identifier picked the + most-common tag, other albums' tracks left unmatched. + +2. **Always recurse into non-disc subfolders.** Pre-fix the scanner + would skip subfolders entirely when loose files existed at the + same level. So a layout like:: + + Staging/ + loose1.flac ← processed + Disc 1/ ← attached to loose + Album Folder/ ← IGNORED + + would silently skip "Album Folder" because root had loose files. + Post-fix: subfolders always recursed regardless of loose files. + +Tests use temp directories with real FLAC files (mutagen-written) +so the scanner's tag reads exercise the real code path. +""" + +from __future__ import annotations + +import os +import struct +import tempfile + +import pytest + +from core.auto_import_worker import AutoImportWorker + + +def _write_flac(path: str, *, album: str = '', track: int = 0, disc: int = 1, title: str = 'Test'): + """Write a real FLAC with the given tags. Same minimal-FLAC + bootstrap pattern used elsewhere in the test suite.""" + from mutagen.flac import FLAC + + fLaC = b'fLaC' + streaminfo = bytearray(34) + streaminfo[0:2] = struct.pack('>H', 4096) + streaminfo[2:4] = struct.pack('>H', 4096) + streaminfo[10] = 0x0A + streaminfo[12] = 0x70 + block_header = bytes([0x80, 0x00, 0x00, 0x22]) + with open(path, 'wb') as f: + f.write(fLaC + block_header + bytes(streaminfo)) + + audio = FLAC(path) + if album: + audio['ALBUM'] = album + if track: + audio['TRACKNUMBER'] = str(track) + if disc: + audio['DISCNUMBER'] = str(disc) + if title: + audio['TITLE'] = title + audio.save() + + +@pytest.fixture +def worker(): + """Bare worker — `_scan_directory` doesn't need full deps.""" + return AutoImportWorker.__new__(AutoImportWorker) + + +# --------------------------------------------------------------------------- +# Loose-file grouping by album tag +# --------------------------------------------------------------------------- + + +def test_loose_files_from_multiple_albums_become_multiple_candidates(worker, tmp_path): + """Two albums' worth of tracks at root → two candidates, not one + bundle. Validates the chaotic-staging fix.""" + # Album A: 3 tracks + for i in range(1, 4): + _write_flac( + str(tmp_path / f'A_{i}.flac'), + album='Album A', track=i, title=f'A track {i}', + ) + # Album B: 2 tracks + for i in range(1, 3): + _write_flac( + str(tmp_path / f'B_{i}.flac'), + album='Album B', track=i, title=f'B track {i}', + ) + + candidates = [] + worker._scan_directory(str(tmp_path), candidates, staging_root=str(tmp_path)) + + assert len(candidates) == 2 + album_keys = sorted( + len(c.audio_files) for c in candidates if not c.is_single + ) + assert album_keys == [2, 3] # one 3-track album + one 2-track album + + +def test_untagged_loose_files_become_individual_singles(worker, tmp_path): + """Files with no album tag can't be grouped — each becomes its + own single candidate.""" + _write_flac(str(tmp_path / 'orphan_a.flac'), album='', track=0) + _write_flac(str(tmp_path / 'orphan_b.flac'), album='', track=0) + + candidates = [] + worker._scan_directory(str(tmp_path), candidates, staging_root=str(tmp_path)) + + singles = [c for c in candidates if c.is_single] + assert len(singles) == 2 + + +def test_single_album_loose_files_still_one_candidate(worker, tmp_path): + """Regression guard — when all loose files share an album, behavior + matches the previous "bundle everything into one candidate" path. + Single-album staging shouldn't fragment into per-track singles.""" + for i in range(1, 6): + _write_flac( + str(tmp_path / f'track_{i}.flac'), + album='Single Album', track=i, title=f'Song {i}', + ) + + candidates = [] + worker._scan_directory(str(tmp_path), candidates, staging_root=str(tmp_path)) + + album_candidates = [c for c in candidates if not c.is_single] + assert len(album_candidates) == 1 + assert len(album_candidates[0].audio_files) == 5 + + +# --------------------------------------------------------------------------- +# Always-recurse-into-subfolders +# --------------------------------------------------------------------------- + + +def test_subfolders_processed_even_when_root_has_loose_files(worker, tmp_path): + """The original bug — root has loose files AND a non-disc + subfolder. Pre-fix: subfolder ignored. Post-fix: subfolder + recursed.""" + # Loose file at root + _write_flac( + str(tmp_path / 'loose.flac'), + album='Loose Album', track=1, title='Loose Song', + ) + + # Subfolder with its own album + sub = tmp_path / 'Other Album' + sub.mkdir() + for i in range(1, 4): + _write_flac( + str(sub / f't{i}.flac'), + album='Other Album', track=i, title=f'Other {i}', + ) + + candidates = [] + worker._scan_directory(str(tmp_path), candidates, staging_root=str(tmp_path)) + + # 1 candidate from loose + 1 candidate from subfolder = 2 + assert len(candidates) == 2 + paths = {c.path for c in candidates} + assert any('Other Album' in p for p in paths), ( + f"Subfolder candidate missing — paths: {paths}. Pre-fix " + f"behavior: scanner ignored the subfolder when root had " + f"loose files." + ) + + +# --------------------------------------------------------------------------- +# Disc folder attachment to loose-file groups +# --------------------------------------------------------------------------- + + +def test_disc_folder_attaches_to_matching_loose_group(worker, tmp_path): + """Loose Mr. Morale tracks at root + Disc 2 folder also tagged + Mr. Morale → disc folder merges into the Mr. Morale loose + candidate. Mirrors the user's typical multi-disc layout.""" + # Loose disc 1 tracks + for i in range(1, 4): + _write_flac( + str(tmp_path / f'disc1_{i}.flac'), + album='Mr. Morale', track=i, disc=1, title=f'D1 {i}', + ) + + # Disc 2 folder, same album + disc2 = tmp_path / 'Disc 2' + disc2.mkdir() + for i in range(1, 4): + _write_flac( + str(disc2 / f'd2_{i}.flac'), + album='Mr. Morale', track=i, disc=2, title=f'D2 {i}', + ) + + candidates = [] + worker._scan_directory(str(tmp_path), candidates, staging_root=str(tmp_path)) + + assert len(candidates) == 1 + candidate = candidates[0] + # All 6 files (3 loose + 3 disc 2) merged into one candidate + assert len(candidate.audio_files) == 6 + # Disc structure carries both disc 0 (loose) + disc 2 + assert 0 in candidate.disc_structure + assert 2 in candidate.disc_structure + + +def test_disc_folder_with_no_matching_loose_group_becomes_standalone(worker, tmp_path): + """Loose Album A tracks at root + Disc 2 folder tagged Album B → + disc folder doesn't merge into A; becomes its own candidate.""" + _write_flac( + str(tmp_path / 'a1.flac'), + album='Album A', track=1, title='A1', + ) + _write_flac( + str(tmp_path / 'a2.flac'), + album='Album A', track=2, title='A2', + ) + + disc2 = tmp_path / 'Disc 2' + disc2.mkdir() + _write_flac( + str(disc2 / 'b1.flac'), + album='Album B', track=1, disc=2, title='B1', + ) + + candidates = [] + worker._scan_directory(str(tmp_path), candidates, staging_root=str(tmp_path)) + + # 1 candidate for Album A loose + 1 candidate for the standalone + # disc folder = 2 total + assert len(candidates) == 2 + a_candidate = next(c for c in candidates if len(c.audio_files) == 2) + standalone = next(c for c in candidates if c is not a_candidate) + assert len(a_candidate.audio_files) == 2 # only Album A loose, no disc + assert len(standalone.audio_files) == 1 # Album B disc 2 alone + + +# --------------------------------------------------------------------------- +# Disc-only directory (regression guard) +# --------------------------------------------------------------------------- + + +def test_disc_only_directory_still_works(worker, tmp_path): + """No loose files, only Disc 1/Disc 2 subfolders → treat parent + directory as the album candidate. Pre-existing behavior preserved.""" + for disc_num in (1, 2): + disc_dir = tmp_path / f'Disc {disc_num}' + disc_dir.mkdir() + for i in range(1, 4): + _write_flac( + str(disc_dir / f'd{disc_num}_t{i}.flac'), + album='Disc Only Album', track=i, disc=disc_num, + title=f'D{disc_num}T{i}', + ) + + candidates = [] + worker._scan_directory(str(tmp_path), candidates, staging_root=str(tmp_path)) + + assert len(candidates) == 1 + assert len(candidates[0].audio_files) == 6 + assert candidates[0].disc_structure == { + 1: candidates[0].disc_structure[1], + 2: candidates[0].disc_structure[2], + }