Auto-import: dedup on folder_hash, not path — fixes silent-skip bug

User reported nothing happening on a chaotic staging root despite
6 candidates being detected. Logs showed "Processing folder" for 3
of 6 — the other 3 were silently skipped.

Root cause:

The previous commit (`a9a6168`) introduced loose-file grouping —
multiple `FolderCandidate` objects can now share a `path` (each
album group at the staging root has the same parent directory but
its own audio_files + folder_hash). But two pieces of dedup
machinery still keyed on `path`:

- `_processing_hashes` (was `_processing_paths`) — runtime set of
  in-flight candidates. Path-keyed → first sibling marks the path,
  second + third siblings hit "already in flight" and skip.

- `_folder_snapshots` — mtime cache for stability check. Path-keyed
  → siblings overwrite each other's mtimes, stability check returns
  unreliable results for whichever sibling lost the write race.

Both kept track of an attribute that was previously unique-per-path
(one candidate per directory) but my refactor broke that
invariant without updating the dedup keys. Net effect: only the
first candidate per directory ever got processed in a chaotic-root
scenario.

Fix:

- Renamed `_processing_paths` → `_processing_hashes` set, keyed on
  `candidate.folder_hash`. Hash is unique per candidate by
  construction (different audio_files lists hash differently).
- `_folder_snapshots` retyped + rekeyed to `folder_hash`. Siblings
  no longer overwrite each other's mtime tracking.
- Both touched in lockstep — comments document why path-keyed
  dedup breaks for sibling candidates.

Test added (`test_sibling_candidates_have_unique_folder_hashes`):
verifies 3-album loose root produces 3 candidates with distinct
folder_hashes. If a future change breaks the invariant, the test
fails before the silent-skip regression ships.

Verification:
- 178 imports tests pass (8 new this commit + 170 pre-existing
  this branch)
- Ruff clean
- Still scoped to import flow
pull/536/head
Broque Thomas 6 days ago
parent a9a6168568
commit a478747a89

@ -182,7 +182,13 @@ class AutoImportWorker:
# State
self._folder_snapshots: Dict[str, float] = {} # path -> mtime_sum
self._processing_paths: set = set() # Paths currently being processed (skip on rescan)
# Candidates currently being processed (skip on rescan). Keyed
# on folder_hash, NOT path — multiple candidates can share a
# path (each loose-file group at staging root has the same
# parent directory but a distinct hash from its own audio
# files). Path-keyed dedup would treat siblings as duplicates
# and silently skip all but the first.
self._processing_hashes: set = set()
self._current_folder = ''
self._current_status = 'idle' # 'idle' | 'scanning' | 'processing'
# Live per-track progress so the UI can show "Processing Speak Now
@ -296,8 +302,12 @@ class AutoImportWorker:
self._current_folder = candidate.name
# Skip folders currently being processed by a previous scan cycle
if candidate.path in self._processing_paths:
# Skip candidates currently being processed by a previous
# scan cycle. Keyed on folder_hash because multiple
# candidates can share a path (loose-file groups at the
# same directory level each get their own candidate but
# share the parent directory).
if candidate.folder_hash in self._processing_hashes:
logger.debug(f"[Auto-Import] Skipping {candidate.name} — still processing from previous cycle")
continue
@ -312,8 +322,8 @@ class AutoImportWorker:
self._stats['scanned'] += 1
logger.info(f"[Auto-Import] Processing folder: {candidate.name} ({len(candidate.audio_files)} files)")
# Mark as in-progress so next scan cycle skips this folder
self._processing_paths.add(candidate.path)
# Mark as in-progress so next scan cycle skips this candidate
self._processing_hashes.add(candidate.folder_hash)
try:
# Phase 3: Identify
identification = self._identify_folder(candidate)
@ -403,7 +413,7 @@ class AutoImportWorker:
self._record_result(candidate, 'failed', 0.0, error_message=str(e))
self._stats['failed'] += 1
finally:
self._processing_paths.discard(candidate.path)
self._processing_hashes.discard(candidate.folder_hash)
# Defensive: if the inner code path didn't reset live
# progress (early raise, etc.), clear it so the UI
# doesn't show stale "processing track 3/14" forever.
@ -636,14 +646,20 @@ class AutoImportWorker:
))
def _is_folder_stable(self, candidate: FolderCandidate) -> bool:
"""Check if folder contents have stopped changing."""
"""Check if the candidate's audio files have stopped changing.
    Keyed on folder_hash, NOT path — multiple candidates can share
a path (loose-file groups at the same directory level) so
path-keyed snapshots would overwrite each other's mtimes and
make stability checks unreliable for sibling candidates.
"""
try:
current_mtime = sum(os.path.getmtime(f) for f in candidate.audio_files if os.path.exists(f))
except OSError:
return False
prev = self._folder_snapshots.get(candidate.path)
self._folder_snapshots[candidate.path] = current_mtime
prev = self._folder_snapshots.get(candidate.folder_hash)
self._folder_snapshots[candidate.folder_hash] = current_mtime
if prev is None:
return False # First scan — wait for next cycle to confirm stability

@ -242,6 +242,39 @@ def test_disc_folder_with_no_matching_loose_group_becomes_standalone(worker, tmp
# ---------------------------------------------------------------------------
def test_sibling_candidates_have_unique_folder_hashes(worker, tmp_path):
    """Pin the dedup-collision bug: when loose files at one level
    produce multiple candidates (one per album group), each must have
    a unique folder_hash. The runtime's `_processing_hashes` set + the
    DB's `auto_import_history.folder_hash` index both key on this —
    if siblings collide, only the first one gets processed and the
    rest silently skip as "still processing from previous cycle."

    Reported on 2026-05-09 — a multi-album loose root produced 3
    candidates with the same path. Path-keyed dedup treated them as
    duplicates and skipped two of three. Fixed by switching dedup to
    folder_hash + ensuring each candidate's hash is unique."""
    # Three albums' worth of loose files in ONE directory — reproduces
    # the reported scenario (three sibling candidates sharing a path).
    for album in ('A', 'B', 'C'):
        for i in range(1, 4):
            _write_flac(
                str(tmp_path / f'{album}_{i}.flac'),
                album=f'Album {album}', track=i, title=f'{album} {i}',
            )
    candidates = []
    worker._scan_directory(str(tmp_path), candidates, staging_root=str(tmp_path))
    # Every album group must surface as its own candidate; a grouping
    # regression that merged siblings would otherwise make the
    # uniqueness assertion below pass vacuously.
    assert len(candidates) == 3, (
        f"Expected one candidate per album group, got {len(candidates)}"
    )
    hashes = [c.folder_hash for c in candidates]
    assert len(hashes) == len(set(hashes)), (
        f"Sibling candidates collided on folder_hash: {hashes}. "
        f"Path-keyed dedup would silently skip duplicates."
    )
def test_disc_only_directory_still_works(worker, tmp_path):
"""No loose files, only Disc 1/Disc 2 subfolders → treat parent
directory as the album candidate. Pre-existing behavior preserved."""

Loading…
Cancel
Save