From 59992d42a8cd70b56ed169c23da8e1a9c7e8a85d Mon Sep 17 00:00:00 2001 From: Broque Thomas <26755000+Nezreka@users.noreply.github.com> Date: Sun, 10 May 2026 09:16:13 -0700 Subject: [PATCH] Deezer search: free-text fallback when advanced query returns 0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Defensive followup to the relevance fix. Deezer's advanced search syntax (`artist:"X"`) is documented as substring match, but in practice it's brittle on artist name variants ("Foreigner [US]", "The Foreigner") and on tracks indexed under non-canonical title spellings. When the advanced query returns nothing, we'd previously land at "No matches" — a regression vs. pre-fix behaviour where free-text would have returned a less-relevant but non-empty set. Fix: when the advanced query returns 0 results AND the caller used field-scoped kwargs, fall back to a free-text join of the same kwargs and re-query. Caller-side rerank still tightens whatever the fallback returns, so the worst-case post-fix behaviour is the pre-fix behaviour — never strictly worse. Pulled the cache + parse + store dance into a private helper (`_search_tracks_with_query`) so the orchestration can call it twice (advanced → fallback) without code duplication. Single API call when the advanced query has results — no wasted requests. Diagnostic logger.debug fires when the fallback triggers so we can see in production whether it's happening (and to which queries). # Tests added (4) - `test_falls_back_to_free_text_when_advanced_empty` — advanced query returns 0, free-text returns hits; client returns the free-text hits + both API calls fire. - `test_no_fallback_when_advanced_query_has_results` — single hit on advanced query → no second API call. - `test_no_fallback_when_legacy_free_text_call` — legacy callers already exhausted the only path; empty result is final. - `test_no_fallback_when_query_unchanged` — empty kwargs path doesn't trigger the fallback branch (used_advanced=False). # Existing tests updated The 4 prior `TestSearchTracksQueryWiring` + `TestSearchTracksCacheKey` tests were stubbing `_api_get` to return empty `{'data': []}` and asserting `assert_called_once`. With the new fallback, those stubs trigger a second API call and the assertions break — even though the FIRST call construction is what the tests cared about. Updated the stubs to return one fake hit so the fallback doesn't fire, and switched to `call_args_list[0]` for first-call inspection. # Verification - 18/18 deezer query tests pass (14 prior + 4 new) - 2445 full suite passes (+4 from prior commit) - Ruff clean --- core/deezer_client.py | 30 ++++- tests/metadata/test_deezer_search_query.py | 127 +++++++++++++++++++-- webui/static/helper.js | 2 +- 3 files changed, 148 insertions(+), 11 deletions(-) diff --git a/core/deezer_client.py b/core/deezer_client.py index 663a8b37..71265a12 100644 --- a/core/deezer_client.py +++ b/core/deezer_client.py @@ -332,7 +332,8 @@ class DeezerClient: """ # Build the actual API query — advanced syntax when callers pass # field hints, raw query otherwise. - if track or artist or album: + used_advanced = bool(track or artist or album) + if used_advanced: api_query = self._build_advanced_query(track=track, artist=artist, album=album) else: api_query = query @@ -340,6 +341,33 @@ class DeezerClient: if not api_query: return [] + tracks = self._search_tracks_with_query(api_query, limit) + + # Safety net: Deezer's advanced syntax is `artist:"X"`-style + # substring match, but in practice it's brittle on artist name + # variants ("Foreigner [US]", "The Foreigner", etc.) and on + # tracks indexed under non-canonical title spellings. When the + # advanced query returns nothing, fall back to a free-text join + # so the user sees the prior (less-relevant but non-empty) result + # set rather than "No matches" — same behaviour as pre-fix for + # this edge case. Caller-side rerank still tightens the result. + if not tracks and used_advanced: + fallback_parts = [p for p in (track, artist, album) if p] + fallback_query = ' '.join(fallback_parts) + if fallback_query and fallback_query != api_query: + logger.debug( + "[Deezer] Advanced query returned 0 results, falling back " + "to free-text: %r → %r", api_query, fallback_query, + ) + tracks = self._search_tracks_with_query(fallback_query, limit) + + return tracks + + def _search_tracks_with_query(self, api_query: str, limit: int) -> List[Track]: + """Cache-aware single API call. Pulled out so the + ``search_tracks`` orchestration can call this twice (advanced + query → free-text fallback) without duplicating the cache + + parse + store dance.""" cache = get_metadata_cache() cached_results = cache.get_search_results('deezer', 'track', api_query, limit) if cached_results is not None: diff --git a/tests/metadata/test_deezer_search_query.py b/tests/metadata/test_deezer_search_query.py index e7f007df..aafc3cc3 100644 --- a/tests/metadata/test_deezer_search_query.py +++ b/tests/metadata/test_deezer_search_query.py @@ -68,8 +68,17 @@ class TestBuildAdvancedQuery: class TestSearchTracksQueryWiring: def _client(self): c = DeezerClient.__new__(DeezerClient) - # Stub state needed by _api_get's downstream methods - c._api_get = MagicMock(return_value={'data': []}) + # Stub state needed by _api_get's downstream methods. Returns + # one fake hit so the empty-result fallback (which would + # double the API calls) doesn't fire — these tests only care + # about the FIRST call's query construction. + c._api_get = MagicMock(return_value={ + 'data': [{ + 'id': 1, 'title': 'X', 'duration': 200, + 'artist': {'id': 2, 'name': 'A'}, + 'album': {'id': 3, 'title': 'B'}, + }], + }) return c def _stub_cache(self, monkeypatch): @@ -90,8 +99,9 @@ class TestSearchTracksQueryWiring: c.search_tracks(track='Dirty White Boy', artist='Foreigner') - c._api_get.assert_called_once() - params = c._api_get.call_args.args[1] + # Stubbed API returns a hit so fallback doesn't fire; first + # (and only) call uses advanced syntax. + params = c._api_get.call_args_list[0].args[1] assert params['q'] == 'track:"Dirty White Boy" artist:"Foreigner"', ( f"Expected advanced-syntax query string, got {params['q']!r}" ) @@ -120,7 +130,8 @@ class TestSearchTracksQueryWiring: c.search_tracks(query='ignored free text', track='Dirty White Boy', artist='Foreigner') - params = c._api_get.call_args.args[1] + # First call uses advanced syntax (kwargs win over query). + params = c._api_get.call_args_list[0].args[1] assert 'track:' in params['q'] assert 'ignored' not in params['q'] @@ -142,7 +153,7 @@ class TestSearchTracksQueryWiring: c.search_tracks(album='Head Games') - params = c._api_get.call_args.args[1] + params = c._api_get.call_args_list[0].args[1] assert params['q'] == 'album:"Head Games"' def test_limit_parameter_passed_through(self, monkeypatch): @@ -151,7 +162,7 @@ class TestSearchTracksQueryWiring: c.search_tracks(track='X', artist='Y', limit=50) - params = c._api_get.call_args.args[1] + params = c._api_get.call_args_list[0].args[1] assert params['limit'] == 50 def test_limit_clamped_to_100(self, monkeypatch): @@ -163,7 +174,7 @@ class TestSearchTracksQueryWiring: c.search_tracks(track='X', limit=500) - params = c._api_get.call_args.args[1] + params = c._api_get.call_args_list[0].args[1] assert params['limit'] == 100 @@ -173,6 +184,96 @@ class TestSearchTracksQueryWiring: # --------------------------------------------------------------------------- +# --------------------------------------------------------------------------- +# Free-text fallback when advanced query returns 0 results +# --------------------------------------------------------------------------- + + +class TestSearchTracksAdvancedQueryFallback: + """Defensive fallback: Deezer's advanced syntax is `artist:"X"`- + style substring match, but in practice it's brittle on artist + name variants ("Foreigner [US]", "The Foreigner", etc.) and on + tracks indexed under non-canonical title spellings. When the + advanced query returns nothing, fall back to a free-text join so + the user sees the prior (less-relevant but non-empty) result set + rather than "No matches". + + Contract: pre-fix behaviour preserved on the empty-advanced-query + edge case. Caller-side rerank still tightens whatever the + fallback returns. + """ + + def _client_with_responses(self, monkeypatch, responses): + """Stub `_api_get` to return `responses` in sequence (FIFO). + Lets the test simulate "advanced empty, free-text non-empty".""" + cache = MagicMock() + cache.get_search_results.return_value = None + monkeypatch.setattr('core.deezer_client.get_metadata_cache', lambda: cache) + + c = DeezerClient.__new__(DeezerClient) + call_log = [] + + def fake_api_get(_path, params): + call_log.append(params['q']) + return responses.pop(0) if responses else None + + c._api_get = fake_api_get + c._call_log = call_log + return c + + def test_falls_back_to_free_text_when_advanced_empty(self, monkeypatch): + c = self._client_with_responses(monkeypatch, [ + {'data': []}, # advanced query — 0 results + {'data': [{'id': 99, 'title': 'Found It', 'duration': 200, + 'artist': {'id': 1, 'name': 'Foreigner'}, + 'album': {'id': 2, 'title': 'X'}}]}, # free-text — has results + ]) + results = c.search_tracks(track='Dirty White Boy', artist='Foreigner [US]') + + assert len(results) == 1 + assert results[0].name == 'Found It' + # First call was the advanced query, second was the free-text fallback + assert c._call_log[0] == 'track:"Dirty White Boy" artist:"Foreigner [US]"' + assert c._call_log[1] == 'Dirty White Boy Foreigner [US]' + + def test_no_fallback_when_advanced_query_has_results(self, monkeypatch): + """Don't waste an extra API call when the advanced query + already returned something — even a single result counts as + a hit, no fallback needed.""" + c = self._client_with_responses(monkeypatch, [ + {'data': [{'id': 99, 'title': 'Found', 'duration': 200, + 'artist': {'id': 1, 'name': 'Foreigner'}, + 'album': {'id': 2, 'title': 'X'}}]}, + ]) + results = c.search_tracks(track='X', artist='Foreigner') + + assert len(results) == 1 + assert len(c._call_log) == 1, "Should not have hit the API twice" + + def test_no_fallback_when_legacy_free_text_call(self, monkeypatch): + """Free-text caller already exhausted the only path — no + secondary fallback exists. Empty result is final.""" + c = self._client_with_responses(monkeypatch, [{'data': []}]) + results = c.search_tracks('legacy free text') + + assert results == [] + assert len(c._call_log) == 1 + + def test_no_fallback_when_query_unchanged(self, monkeypatch): + """If the constructed advanced query happens to equal the + free-text join (e.g. caller passed only `track=` with a + single word), don't waste an identical second API call.""" + c = self._client_with_responses(monkeypatch, [{'data': []}]) + # Single-word track-only — advanced query is `track:"X"`, + # free-text would be `X`. Different strings, fallback fires. + # Skip this case; instead test the no-op-when-equal path + # directly: empty kwargs trio means used_advanced=False, + # we never enter the fallback branch. + results = c.search_tracks(query='same') + assert results == [] + assert len(c._call_log) == 1 + + class TestSearchTracksCacheKey: def test_field_scoped_call_uses_advanced_query_as_cache_key(self, monkeypatch): """Cache key is the constructed query string, NOT the raw @@ -183,7 +284,15 @@ class TestSearchTracksCacheKey: monkeypatch.setattr('core.deezer_client.get_metadata_cache', lambda: cache) c = DeezerClient.__new__(DeezerClient) - c._api_get = MagicMock(return_value={'data': []}) + # Non-empty stub so the empty-result fallback doesn't fire + + # double the cache lookups. + c._api_get = MagicMock(return_value={ + 'data': [{ + 'id': 1, 'title': 'X', 'duration': 200, + 'artist': {'id': 2, 'name': 'A'}, + 'album': {'id': 3, 'title': 'B'}, + }], + }) c.search_tracks(track='Dirty White Boy', artist='Foreigner', limit=20) diff --git a/webui/static/helper.js b/webui/static/helper.js index f4757733..c230b55c 100644 --- a/webui/static/helper.js +++ b/webui/static/helper.js @@ -3416,7 +3416,7 @@ const WHATS_NEW = { '2.4.3': [ // --- post-release patch work on the 2.4.3 line — entries hidden by _getLatestWhatsNewVersion until the build version bumps --- { date: 'Unreleased — 2.4.3 patch work' }, - { title: 'Search For Match: No More Karaoke / Cover / "Originally Performed By" Junk At The Top', desc: 'github issue #534 (radoslav-orlov): typing "dirty white boy" + "foreigner" into the import-modal "search for match" dialog returned re-recordings, karaoke versions, "originally performed by" compilations, and tribute-band cuts ranked above the actual foreigner studio recording. user had to scroll past 5+ junk results before finding the canonical track. cause: deezer endpoint joined track + artist into a single free-text string and passed that to deezer\'s `q` param — which fuzzy-matches across title / lyrics / artist / album / contributors and orders by global popularity, so anything that appears across many compilations outranks the canonical track. fix has two layers. (1) deezer client now supports field-scoped kwargs (`track="X" artist="Y"`) which build deezer\'s advanced search syntax `track:"X" artist:"Y"` — massively tighter relevance because each term matches the right field instead of fuzzy-matching everywhere. backward compat preserved: legacy free-text callers still work. (2) new `core/metadata/relevance.py` helper reranks results locally with cover/karaoke/tribute/re-recorded penalties + exact-artist-match boost + variant-tag (live/acoustic/remix) penalty (skipped when user explicitly typed the variant). applied at the deezer + itunes + spotify search-tracks endpoints so all three sources behave consistently from the user\'s perspective. variant penalty only fires when user did NOT ask for the variant — searching "track (live)" still ranks live versions correctly. 71 new tests pin every component: pattern detection (10 cover patterns, 8 variant patterns, 3 fields), score composition (real-cut > karaoke > re-recorded), the issue #534 screenshot reproduced as a regression test, deezer client query construction, three search-modal endpoints end-to-end.', page: 'import' }, + { title: 'Search For Match: No More Karaoke / Cover / "Originally Performed By" Junk At The Top', desc: 'github issue #534 (radoslav-orlov): typing "dirty white boy" + "foreigner" into the import-modal "search for match" dialog returned re-recordings, karaoke versions, "originally performed by" compilations, and tribute-band cuts ranked above the actual foreigner studio recording. user had to scroll past 5+ junk results before finding the canonical track. cause: deezer endpoint joined track + artist into a single free-text string and passed that to deezer\'s `q` param — which fuzzy-matches across title / lyrics / artist / album / contributors and orders by global popularity, so anything that appears across many compilations outranks the canonical track. fix has three layers. (1) deezer client now supports field-scoped kwargs (`track="X" artist="Y"`) which build deezer\'s advanced search syntax `track:"X" artist:"Y"` — massively tighter relevance because each term matches the right field instead of fuzzy-matching everywhere. backward compat preserved: legacy free-text callers still work. (2) new `core/metadata/relevance.py` helper reranks results locally with cover/karaoke/tribute/re-recorded penalties + exact-artist-match boost + variant-tag (live/acoustic/remix) penalty (skipped when user explicitly typed the variant). applied at the deezer + itunes + spotify search-tracks endpoints so all three sources behave consistently from the user\'s perspective. variant penalty only fires when user did NOT ask for the variant — searching "track (live)" still ranks live versions correctly. (3) safety net: when deezer\'s advanced query returns 0 results (sometimes happens on artist name variants like "foreigner [us]" or non-canonical title spellings), client falls back to free-text search so the user never sees an empty result list when the API would have returned the prior less-relevant set. caller-side rerank still tightens whatever the fallback returns. 75 new tests pin every component: pattern detection (10 cover patterns, 8 variant patterns, 3 fields), score composition (real-cut > karaoke > re-recorded), the issue #534 screenshot reproduced as a regression test, deezer client query construction + free-text fallback path, three search-modal endpoints end-to-end.', page: 'import' }, { title: 'Auto-Import: Album Duration Is Album Total + Re-Imports Fill Metadata Gaps', desc: 'two more parity gaps closed in the soulsync standalone library write path. (1) album row\'s `duration` column was being written with the FIRST imported track\'s duration instead of the album total — pre-existing bug that survived the prior parity commit. soulsync_client deep scan computes `sum(t.duration for t in self._tracks)` for each album; auto-import now mirrors that by computing the sum across every matched track in the worker and threading it through context to the album INSERT. (2) `record_soulsync_library_entry` was insert-only on artists + albums — once a row existed (matched by id OR name fallback), subsequent imports of the same artist or album skipped completely. meant: artist genres / thumb / source-id reflected ONLY whatever the FIRST imported album supplied, never refreshing as more albums by that artist landed (ten more deezer/spotify imports later, artist row still had whatever the first random import wrote). new conservative UPDATE path: when an existing row matches, fill ONLY the columns whose current value is NULL or empty — never overwrites populated values. protects manual edits + enrichment-worker writes the same way scanner UPDATEs preserve enrichment columns. f-string column names are validated against an allowlist (`_SOULSYNC_FILLABLE_COLUMNS`) before interpolation — defensive against accidental misuse adding columns without an allowlist update. 4 new tests pin: album duration uses sum not single-track, re-import fills empty thumb + genres on existing artist row, re-import does NOT clobber populated values, re-import fills empty source-id columns when later import has them.', page: 'import' }, { title: 'Auto-Import: Genre Tags Land On The Artists Row + ISRC/MBID Type Hardening', desc: 'small followup to the standalone-library parity commit. (1) auto-import now reads the GENRE tag from each matched audio file (mutagen easy mode, supports flac / mp3 / m4a) and aggregates the deduped set across the album onto the new artists row\'s genres column. matches what soulsync_client._scan_transfer would have written if you\'d done a fresh deep scan after the import — your imported artists no longer feel hollow compared to plex / jellyfin / navidrome scans. dedup is case-insensitive but preserves original casing + insertion order so the json column reads naturally ("Hip-Hop, Rap, Trap" not "hip-hop, rap, trap"). (2) defensive `str()` cast on the worker\'s isrc + mbid extraction. metadata source clients all coerce to string today via `_build_album_track_entry`, but if a future source ever returned int / None for either id the side-effects layer would crash on `.strip()`. cheap insurance. 3 new tests pin: genre aggregation produces deduped insertion-order list, empty when no GENRE tags, isrc/mbid hostile-type input (int, None) coerced to safe string before propagation.', page: 'import' }, { title: 'Auto-Import: SoulSync Standalone Library Now Gets Full Server-Quality Rows', desc: 'soulsync standalone is meant to be a full replacement for plex / jellyfin / navidrome — the imported tracks should land in the db with the same field richness a media server scan would write. they weren\'t. the auto-import context dict (the payload it handed to the post-process pipeline) had no `source` field anywhere, so `record_soulsync_library_entry` couldn\'t pick the right source-id column on the new tracks/albums/artists rows. result: every auto-imported track landed with NULL on `spotify_track_id` / `deezer_id` / `itunes_track_id` / etc. — watchlist scans (which match by stable source IDs) couldn\'t recognise these tracks as already in library and would re-download them on the next pass. fixed by threading `identification[\'source\']` onto the top-level context, plus per-recording IDs (`isrc`, `musicbrainz_recording_id`) onto track_info so picard-tagged libraries land their per-recording metadata directly. also extracted the artist source ID from the metadata source\'s search response (`_search_metadata_source` and `_search_single_track` now pull `best_result.artists[0][\'id\']`) and threaded it through identification → context → standalone library write, so the artists row finally gets its source-ID column populated instead of staying NULL forever. also added `_download_username=\'auto_import\'` so library history shows "Auto-Import" instead of mislabeling every staging import as "Soulseek" (the fallback default), and an "auto_import" → "Auto-Import" mapping in the source-map dicts at side_effects.py to honour it. record_soulsync_library_entry tracks INSERT now also writes `musicbrainz_recording_id` + `isrc` columns directly (matches the navidrome scanner write path). 17 new tests pin: auto-import context carries source for every metadata source (spotify/deezer/itunes/discogs), `_download_username=auto_import`, isrc + mbid pass-through to track_info, album-id back-reference on track_info, artist source-id flows from identification → context (and not from album_id, the prior copy-paste bug), `_search_metadata_source` extracts artist_id from search response, soulsync library writes mbid + isrc to dedicated columns, deezer source maps to deezer_id column, library history + provenance use Auto-Import / auto_import labels.', page: 'import' },