From f628009ab4562d1b44adc2de6315c20337feb822 Mon Sep 17 00:00:00 2001 From: Broque Thomas <26755000+Nezreka@users.noreply.github.com> Date: Sat, 9 May 2026 20:15:49 -0700 Subject: [PATCH] Auto-import: aggregate GENRE tags onto artists row + harden ISRC/MBID types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cin pre-review followup. Two small parity gaps the prior commits left open: # 1. Genre tags land on the standalone artists row `soulsync_client._scan_transfer` aggregates the GENRE tag across every track in an album and surfaces it on `SoulSyncAlbum.genres` (which the DatabaseUpdateWorker writes to the artists+albums row). Auto-import was hardcoding `'spotify_artist': {'genres': []}` so the imported artists row landed with empty genres — felt hollow compared to a Plex/Jellyfin scan, which both pull genres from their respective APIs. Fix: - `_read_file_tags` now reads the GENRE tag (mutagen easy mode handles MP3/FLAC/M4A consistently; some files carry multiple genres so it's always returned as a list). - `_process_matches` aggregates genres from each matched file's tags into a deduped insertion-order list. Dedup is case-insensitive but preserves original casing — so "Hip-Hop, Rap, Trap" reads naturally in the JSON column instead of "hip-hop, rap, trap". - Worker context's `spotify_artist['genres']` carries the aggregated list, which `record_soulsync_library_entry` already filters via `core.genre_filter.filter_genres` and writes to the artists row. # 2. Defensive str() cast for ISRC + MBID `_build_album_track_entry` already coerces ISRC + MBID to string today (via `str(isrc) if isrc else ''`). But if a future metadata-source client returns int / None for either ID, the worker would propagate the wrong type and side_effects.py's `.strip()` would AttributeError. Cheap insurance: explicit `str()` cast in the worker before assignment to track_info. Future-proofs against client drift. # Tests added (3, in test_auto_import_context_shape.py): - `test_context_aggregates_genres_from_track_tags` — multi-file album with overlapping genre lists produces deduped, insertion- ordered, original-case-preserved result. Stubs `_read_file_tags` with monkeypatch so we don't need real audio. - `test_context_genres_empty_when_no_tags` — files without GENRE tag → empty list. Standalone library write handles gracefully (genres column stays empty / NULL). - `test_context_isrc_mbid_coerced_to_string` — hostile types (int 12345678, None, int 999) coerced to safe strings before reaching track_info. # Verification - 14/14 context-shape tests pass (11 prior + 3 new) - 213 imports tests pass (no regression) - 2365 full suite passes (+3 from prior, +18 PR-total) - 1 pre-existing flake (`test_watchdog_warns_about_stuck_workers`, passes in isolation) - Ruff clean --- core/auto_import_worker.py | 63 ++++++++- .../imports/test_auto_import_context_shape.py | 129 ++++++++++++++++++ webui/static/helper.js | 1 + 3 files changed, 188 insertions(+), 5 deletions(-) diff --git a/core/auto_import_worker.py b/core/auto_import_worker.py index 10f1bb73..1f3d6591 100644 --- a/core/auto_import_worker.py +++ b/core/auto_import_worker.py @@ -82,7 +82,7 @@ def _read_file_tags(file_path: str) -> Dict[str, Any]: """Read embedded tags from an audio file. Returns dict with: title, artist, album, track_number, disc_number, - year, isrc, mbid, duration_ms. + year, genres, isrc, mbid, duration_ms. The exact-identifier fields (``isrc``, ``mbid``) and the audio duration enable the ID-based fast paths + duration sanity gate in @@ -90,6 +90,12 @@ def _read_file_tags(file_path: str) -> Dict[str, Any]: libraries always carry MBID; most metadata sources carry ISRC) get perfect-match identification without going through fuzzy scoring. + ``genres`` is a list of strings — Mutagen's easy mode returns the + GENRE tag as a list (some files carry multiple genres). Empty list + when the tag is absent. Worker aggregates these across an album's + tracks to populate the artist row's genres column at insert time + (matches the soulsync_client deep-scan behaviour). + All exact-identifier fields default to empty string when the tag isn't present — callers treat empty as "not available, fall back to fuzzy matching". @@ -97,7 +103,7 @@ def _read_file_tags(file_path: str) -> Dict[str, Any]: result = { 'title': '', 'artist': '', 'album': '', 'track_number': 0, 'disc_number': 1, 'year': '', - 'isrc': '', 'mbid': '', 'duration_ms': 0, + 'genres': [], 'isrc': '', 'mbid': '', 'duration_ms': 0, } try: from mutagen import File as MutagenFile @@ -136,6 +142,16 @@ def _read_file_tags(file_path: str) -> Dict[str, Any]: result['disc_number'] = int(str(dn).split('/')[0]) except (ValueError, TypeError): pass + # GENRE — Mutagen easy mode returns a list (some files + # carry multiple genres, e.g. "Hip-Hop;Rap;Trap"). Skip + # empty / whitespace entries so the aggregator doesn't + # have to filter them. + raw_genres = tags.get('genre', []) or [] + if isinstance(raw_genres, str): + raw_genres = [raw_genres] + result['genres'] = [ + str(g).strip() for g in raw_genres if str(g).strip() + ] # ISRC — International Standard Recording Code. Per-recording # unique identifier; metadata sources expose it as `isrc` on # tracks. Picard / Beets both write this tag from MusicBrainz. @@ -1526,6 +1542,28 @@ class AutoImportWorker: # the loop denominator so users see "3/14" while it's working. self._update_active(candidate.folder_hash, track_total=len(all_matches)) + # Aggregate genres from track tags so the standalone library + # write can populate the artists row's `genres` column with + # something meaningful. Mirrors what `soulsync_client._scan_transfer` + # does at deep-scan time — collects the set of genres across + # every track in the album. Without this the artists row gets + # genres=[] and feels empty compared to a Plex/Jellyfin scan. + # Sorted for deterministic ordering (genre-filter dedup uses + # set semantics so this is just for stable JSON output). + aggregated_genres: List[str] = [] + seen_genres: set = set() + for _m in all_matches: + try: + _file_tags = _read_file_tags(_m['file']) + except Exception as _tag_err: + logger.debug("genre tag read failed for %s: %s", _m.get('file'), _tag_err) + continue + for g in _file_tags.get('genres', []) or []: + key = g.lower() + if key and key not in seen_genres: + seen_genres.add(key) + aggregated_genres.append(g) + for index, match in enumerate(all_matches, start=1): track = match['track'] file_path = match['file'] @@ -1574,8 +1612,17 @@ class AutoImportWorker: # metadata layer (`_build_album_track_entry`) so files # tagged with these IDs can match later watchlist scans # without relying on fuzzy title comparison. - track_isrc = track.get('isrc', '') or '' - track_mbid = track.get('musicbrainz_recording_id', '') or track.get('mbid', '') or '' + # Defensive `str()` cast — `_build_album_track_entry` + # already coerces these to str, but if a future source + # client returns a non-string (int, None) the + # downstream `.strip()` in side_effects would + # AttributeError. Cheap insurance. + track_isrc = str(track.get('isrc', '') or '') + track_mbid = str( + track.get('musicbrainz_recording_id', '') + or track.get('mbid', '') + or '' + ) context = { # Top-level `source` is the canonical signal that the # imports pipeline reads via `get_import_source()`. @@ -1593,7 +1640,13 @@ class AutoImportWorker: 'spotify_artist': { 'id': identification.get('artist_id') or '', 'name': artist_name, - 'genres': [], + # Genres aggregated from the matched files' + # GENRE tags (deduped, original-case preserved). + # Mirrors soulsync_client deep-scan behaviour + # so the standalone library write populates + # the artists row's genres column instead of + # leaving it empty. + 'genres': list(aggregated_genres), }, 'spotify_album': { 'id': source_album_id, diff --git a/tests/imports/test_auto_import_context_shape.py b/tests/imports/test_auto_import_context_shape.py index 3cb6995a..de9d797f 100644 --- a/tests/imports/test_auto_import_context_shape.py +++ b/tests/imports/test_auto_import_context_shape.py @@ -325,6 +325,135 @@ def test_context_artist_id_is_empty_when_identification_missing_it(worker_with_c ) +# --------------------------------------------------------------------------- +# Genre aggregation — soulsync standalone parity with deep-scan behaviour +# --------------------------------------------------------------------------- + + +def test_context_aggregates_genres_from_track_tags(worker_with_capture, tmp_path, monkeypatch): + """Worker reads GENRE tag from each matched file and surfaces a + deduped list on `spotify_artist['genres']`. Mirrors what + `soulsync_client._scan_transfer` does at deep-scan time so the + standalone library write populates the artists row's genres + column instead of leaving it empty (which is what plex/jellyfin/ + navidrome scans would have provided).""" + from core import auto_import_worker as worker_mod + + files = [] + for i in range(1, 4): + f = tmp_path / f"0{i}.flac" + f.write_bytes(b"audio") + files.append(f) + + # Stub `_read_file_tags` so we don't need real audio. Each file + # carries a different (overlapping) genre set — deduped result + # should preserve insertion order + original casing. + fake_tags = { + str(files[0]): {'genres': ['Hip-Hop', 'Rap'], 'isrc': '', 'mbid': '', + 'duration_ms': 200000, 'title': 'A', 'artist': 'X', + 'album': 'Album', 'track_number': 1, 'disc_number': 1, 'year': ''}, + str(files[1]): {'genres': ['Rap', 'Trap'], 'isrc': '', 'mbid': '', + 'duration_ms': 200000, 'title': 'B', 'artist': 'X', + 'album': 'Album', 'track_number': 2, 'disc_number': 1, 'year': ''}, + str(files[2]): {'genres': ['hip-hop'], 'isrc': '', 'mbid': '', # case-insensitive dup + 'duration_ms': 200000, 'title': 'C', 'artist': 'X', + 'album': 'Album', 'track_number': 3, 'disc_number': 1, 'year': ''}, + } + monkeypatch.setattr(worker_mod, '_read_file_tags', + lambda path: fake_tags.get(str(path), {'genres': []})) + + cand = _FakeCandidate(path=str(tmp_path), name="Album", + audio_files=[str(f) for f in files]) + ident = _make_identification("spotify") + mr = _make_match_result("spotify", 3) + mr["matches"] = [ + {"track": {"id": f"t{i}", "name": f"Track {i}", "track_number": i, + "disc_number": 1, "duration_ms": 200000, + "artists": [{"name": "X"}]}, + "file": str(files[i - 1]), "confidence": 0.95} + for i in range(1, 4) + ] + + worker_with_capture._process_matches(cand, ident, mr) + + ctx = worker_with_capture._captured[0] + genres = ctx["spotify_artist"]["genres"] + # Insertion-order preserved: Hip-Hop (file 1), Rap (file 1), Trap (file 2). + # 'hip-hop' from file 3 deduped against 'Hip-Hop' (case-insensitive). + assert genres == ["Hip-Hop", "Rap", "Trap"], ( + f"Expected deduped insertion-order genres, got {genres}" + ) + + +def test_context_genres_empty_when_no_tags(worker_with_capture, tmp_path, monkeypatch): + """No GENRE tag on any file → empty list. Standalone library write + handles empty list gracefully (genres column stays empty / NULL).""" + from core import auto_import_worker as worker_mod + + f = tmp_path / "01.flac" + f.write_bytes(b"audio") + monkeypatch.setattr(worker_mod, '_read_file_tags', + lambda path: {'genres': [], 'isrc': '', 'mbid': '', + 'duration_ms': 200000, 'title': '', 'artist': '', + 'album': '', 'track_number': 1, 'disc_number': 1, 'year': ''}) + + cand = _FakeCandidate(path=str(tmp_path), name="Album", audio_files=[str(f)]) + ident = _make_identification("spotify") + mr = _make_match_result("spotify", 1) + mr["matches"] = [{ + "track": {"id": "t1", "name": "Track", "track_number": 1, + "disc_number": 1, "duration_ms": 200000, + "artists": [{"name": "A"}]}, + "file": str(f), "confidence": 0.95, + }] + + worker_with_capture._process_matches(cand, ident, mr) + + assert worker_with_capture._captured[0]["spotify_artist"]["genres"] == [] + + +# --------------------------------------------------------------------------- +# Defensive ISRC/MBID type coercion +# --------------------------------------------------------------------------- + + +def test_context_isrc_mbid_coerced_to_string(worker_with_capture, tmp_path): + """If a metadata source returns ISRC or MBID as int / non-string + (no current source does, but defensive against future drift), + the worker coerces to string before assignment so the side-effects + layer's `.strip()` doesn't AttributeError.""" + f = tmp_path / "01.flac" + f.write_bytes(b"audio") + cand = _FakeCandidate(path=str(tmp_path), name="Album", audio_files=[str(f)]) + ident = _make_identification("deezer") + mr = _make_match_result("deezer", 1) + mr["matches"] = [{ + "track": { + "id": "111", + "name": "Track", + "track_number": 1, + "disc_number": 1, + "duration_ms": 200000, + "artists": [{"name": "A"}], + # Hostile types: ints / None — must not propagate + # through to side_effects un-cast. + "isrc": 12345678, + "mbid": None, + "musicbrainz_recording_id": 999, + }, + "file": str(f), "confidence": 0.95, + }] + + worker_with_capture._process_matches(cand, ident, mr) + + ti = worker_with_capture._captured[0]["track_info"] + assert isinstance(ti["isrc"], str) + assert isinstance(ti["musicbrainz_recording_id"], str) + # int 12345678 → "12345678", int 999 → "999" + assert ti["isrc"] == "12345678" + assert ti["musicbrainz_recording_id"] == "999" + + def test_search_metadata_source_extracts_artist_id_from_dict_artist(): """`_search_metadata_source` must extract the artist source ID from `best_result.artists[0]['id']` so identification carries it diff --git a/webui/static/helper.js b/webui/static/helper.js index 67127e05..eb882149 100644 --- a/webui/static/helper.js +++ b/webui/static/helper.js @@ -3416,6 +3416,7 @@ const WHATS_NEW = { '2.4.3': [ // --- post-release patch work on the 2.4.3 line — entries hidden by _getLatestWhatsNewVersion until the build version bumps --- { date: 'Unreleased — 2.4.3 patch work' }, + { title: 'Auto-Import: Genre Tags Land On The Artists Row + ISRC/MBID Type Hardening', desc: 'small followup to the standalone-library parity commit. (1) auto-import now reads the GENRE tag from each matched audio file (mutagen easy mode, supports flac / mp3 / m4a) and aggregates the deduped set across the album onto the new artists row\'s genres column. matches what soulsync_client._scan_transfer would have written if you\'d done a fresh deep scan after the import — your imported artists no longer feel hollow compared to plex / jellyfin / navidrome scans. dedup is case-insensitive but preserves original casing + insertion order so the json column reads naturally ("Hip-Hop, Rap, Trap" not "hip-hop, rap, trap"). (2) defensive `str()` cast on the worker\'s isrc + mbid extraction. metadata source clients all coerce to string today via `_build_album_track_entry`, but if a future source ever returned int / None for either id the side-effects layer would crash on `.strip()`. cheap insurance. 3 new tests pin: genre aggregation produces deduped insertion-order list, empty when no GENRE tags, isrc/mbid hostile-type input (int, None) coerced to safe string before propagation.', page: 'import' }, { title: 'Auto-Import: SoulSync Standalone Library Now Gets Full Server-Quality Rows', desc: 'soulsync standalone is meant to be a full replacement for plex / jellyfin / navidrome — the imported tracks should land in the db with the same field richness a media server scan would write. they weren\'t. the auto-import context dict (the payload it handed to the post-process pipeline) had no `source` field anywhere, so `record_soulsync_library_entry` couldn\'t pick the right source-id column on the new tracks/albums/artists rows. result: every auto-imported track landed with NULL on `spotify_track_id` / `deezer_id` / `itunes_track_id` / etc. — watchlist scans (which match by stable source IDs) couldn\'t recognise these tracks as already in library and would re-download them on the next pass. fixed by threading `identification[\'source\']` onto the top-level context, plus per-recording IDs (`isrc`, `musicbrainz_recording_id`) onto track_info so picard-tagged libraries land their per-recording metadata directly. also extracted the artist source ID from the metadata source\'s search response (`_search_metadata_source` and `_search_single_track` now pull `best_result.artists[0][\'id\']`) and threaded it through identification → context → standalone library write, so the artists row finally gets its source-ID column populated instead of staying NULL forever. also added `_download_username=\'auto_import\'` so library history shows "Auto-Import" instead of mislabeling every staging import as "Soulseek" (the fallback default), and an "auto_import" → "Auto-Import" mapping in the source-map dicts at side_effects.py to honour it. record_soulsync_library_entry tracks INSERT now also writes `musicbrainz_recording_id` + `isrc` columns directly (matches the navidrome scanner write path). 17 new tests pin: auto-import context carries source for every metadata source (spotify/deezer/itunes/discogs), `_download_username=auto_import`, isrc + mbid pass-through to track_info, album-id back-reference on track_info, artist source-id flows from identification → context (and not from album_id, the prior copy-paste bug), `_search_metadata_source` extracts artist_id from search response, soulsync library writes mbid + isrc to dedicated columns, deezer source maps to deezer_id column, library history + provenance use Auto-Import / auto_import labels.', page: 'import' }, { title: 'Auto-Import: Process Multiple Albums At Once', desc: 'auto-import used to process one album at a time. drop 5 albums into staging → wait for the first to fully finish (identify + match + every track post-processed) before the second one even starts. on a slow network or with a big batch this means 30+ minutes of staring at "Processing AlbumOne" while the others sit untouched. now there\'s a small bounded thread pool (3 workers by default, configurable) — up to 3 albums process in parallel, the queue moves through the rest as workers free up. clicking "Scan Now" multiple times no longer spawns extra unbounded scan threads — every trigger (timer + manual button) routes through one shared scan lock so duplicate triggers no-op instead of stacking up. live progress widget on the auto-import card now lists EACH in-flight album with its own track index/total/name instead of one shared scalar that the parallel workers used to stomp on each other. graceful shutdown: stopping the worker waits for in-flight pool work to finish before reporting stopped — no half-moved files or partial DB writes mid-album. stats counters (`scanned` / `auto_processed` / `pending_review` / `failed`) now use a lock so parallel workers don\'t lose increments under load. 17 new tests pin: pool size config, scan lock dedup, executor dispatch + bounded parallelism, cross-trigger candidate dedup, graceful shutdown, per-candidate UI state isolation across parallel workers, stats counter thread-safety, and snapshot consistency.', page: 'import' }, { title: 'Manual Search In The Failed-Track Candidates Modal', desc: 'when a download fails or returns "not found" the user can already click the status cell to open a modal showing whatever search candidates the auto-search left over and pick a different one. that modal now ALSO has a manual search bar. type any query, hit search, get a fresh round of results from the download sources without having to start the whole download flow over from the search page. solves the case where the auto-query was bad (featured artist not in title, parentheticals like "(remastered 2019)" tripping the matcher, slight artist-name variants) but the file genuinely exists on the source. source picker is smart per download mode: single-source mode (soulseek-only / youtube-only / etc) shows a "searching X" label, no dropdown; hybrid mode shows a dropdown with "all sources" default plus every configured source — picking "all" runs parallel searches across all of them and tags each result row with its source badge. only configured sources show up; unconfigured ones are hidden. results stream in as each source completes via NDJSON instead of blocking on the slowest source — the table starts populating the moment the first source returns. clicking a result reuses the existing retry-download flow → same path, same acoustid verification on the file when it lands, no shortcut around the safety net. additive in the truest sense: the existing modal layout / candidates table / download buttons are byte-identical when the user doesn\'t use manual search. backend extends the candidates endpoint with `download_mode` + `available_sources` + a `source` field per candidate (purely additive — old fields untouched), and adds a new `POST /api/downloads/task//manual-search` that streams NDJSON (one header line, one source_results line per source as completed, one done terminator) so the frontend renderer can append rows incrementally. 11 tests pin the streaming contract: query length / source whitelist / task 404 validation, single-source dispatch, parallel "all" dispatch, one-event-per-source streaming shape, unconfigured-source skip + reject, header metadata, and per-source exception isolation (one source raising emits a `source_error` event but doesn\'t fail the stream).', page: 'downloads' },