Merge pull request #485 from Nezreka/fix/integrity-rejection-marks-task-failed

Fix: tasks showed Completed when file was quarantined
pull/486/head
BoulderBadgeDad 3 weeks ago committed by GitHub
commit fbf4bad47a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -356,27 +356,61 @@ class AcoustIDVerification:
return VerificationResult.PASS, msg
# No match found — but if fingerprint score is very high (≥0.95)
# AND there's partial similarity in title or artist, the mismatch is
# likely a language/script difference (e.g. Japanese kanji vs English).
# Skip rather than quarantine a correct file.
# But if both title AND artist similarity are very low, the download
# source gave us a completely wrong file — fail it.
if best_score >= 0.95 and (title_sim >= 0.55 or artist_sim >= ARTIST_MATCH_THRESHOLD):
top = recordings[0]
# AND we have evidence the mismatch is a language/script case
# (rather than two genuinely different songs by the same artist),
# skip rather than quarantine a correct file. Two routes:
#
# (a) Either side of the comparison contains non-ASCII characters
# — strong signal of transliteration / kanji↔roman cases.
# Artist must still be a strong match to use this path.
# (b) Both title AND artist similarity are very high (the song
# is recognizably the same with minor punctuation / casing
# differences that fell below the strict match thresholds).
#
# The OLD logic was ``title_sim >= 0.55 OR artist_sim >= match``.
# That fired for English-vs-English songs by the same artist that
# share NO actual content — e.g. "R.O.T.C (Interlude)" by
# Kendrick Lamar getting accepted as "Rich (Interlude)" by
# Kendrick Lamar because the artist matched perfectly and
# "interlude" was shared in both titles. Reported by user when
# downloading Mr. Morale: three tracks (Rich Interlude, Savior
# Interlude, Savior) all received the wrong R.O.T.C audio file
# because of this leak.
top = recordings[0]
top_title = top.get('title', '?') or ''
top_artist = top.get('artist', '?') or ''
has_non_ascii = (
any(ord(c) > 127 for c in (expected_track_name or ''))
or any(ord(c) > 127 for c in top_title)
)
language_script_skip = (
best_score >= 0.95
and has_non_ascii
and artist_sim >= ARTIST_MATCH_THRESHOLD
)
high_confidence_strong_match_skip = (
best_score >= 0.95
and title_sim >= 0.80
and artist_sim >= ARTIST_MATCH_THRESHOLD
)
if language_script_skip or high_confidence_strong_match_skip:
reason = (
"likely same song in different language/script"
if language_script_skip
else "title/artist match within tolerance"
)
msg = (
f"Title/artist mismatch but fingerprint confidence very high ({best_score:.2f}): "
f"AcoustID='{top.get('title', '?')}' by '{top.get('artist', '?')}', "
f"AcoustID='{top_title}' by '{top_artist}', "
f"expected '{expected_track_name}' by '{expected_artist_name}'"
f"likely same song in different language/script"
f"{reason}"
)
logger.info(f"AcoustID verification SKIPPED (high confidence) - {msg}")
return VerificationResult.SKIP, msg
# Low fingerprint score + no metadata match — file is likely wrong
top = recordings[0]
top_title = top.get('title', '?')
top_artist = top.get('artist', '?')
# `top`, `top_title`, `top_artist` already resolved above for the
# skip-eligibility check.
msg = (
f"Audio mismatch: file identified as '{top_title}' by '{top_artist}', "
f"expected '{expected_track_name}' by '{expected_artist_name}' "

@ -927,6 +927,50 @@ def post_process_matched_download_with_verification(context_key, context, file_p
_notify_download_completed(batch_id, task_id, success=False)
return
# Integrity rejection — the inner pipeline quarantined the file
# because audio integrity (size / parse / duration) failed. Wrapper
# was previously falling through to "assuming success" because
# quarantined files have no _final_processed_path, which left the
# task showing ✅ Completed in the UI even though the file is in
# quarantine. Reported by user when downloading Mr. Morale: 3
# tracks (Rich Interlude, Savior Interlude, Savior) showed
# Completed in the modal but were missing on disk because their
# source files failed integrity and were quarantined.
if context.get('_integrity_failure_msg'):
failure_msg = context.get('_integrity_failure_msg', 'unknown')
logger.error(
f"Task {task_id} failed integrity check — marking failed: {failure_msg}"
)
with tasks_lock:
if task_id in download_tasks:
download_tasks[task_id]['status'] = 'failed'
download_tasks[task_id]['error_message'] = (
f"File integrity check failed: {failure_msg}"
)
with matched_context_lock:
if context_key in matched_downloads_context:
del matched_downloads_context[context_key]
_notify_download_completed(batch_id, task_id, success=False)
return
# Race guard failure — inner code set this when the source file
# disappeared and there was no known destination to fall back on
# (vs the legitimate race-guard skip where a sibling thread
# already moved the file to its destination).
if context.get('_race_guard_failed'):
logger.error(f"Task {task_id} failed race guard — source file gone with no known destination")
with tasks_lock:
if task_id in download_tasks:
download_tasks[task_id]['status'] = 'failed'
download_tasks[task_id]['error_message'] = (
"Source file disappeared before post-processing could complete"
)
with matched_context_lock:
if context_key in matched_downloads_context:
del matched_downloads_context[context_key]
_notify_download_completed(batch_id, task_id, success=False)
return
expected_final_path = context.get('_final_processed_path')
if not expected_final_path:
logger.info(f"No _final_processed_path in context for task {task_id} — cannot verify, assuming success")

@ -0,0 +1,204 @@
"""Tighten the AcoustID "language/script" skip exemption.
User report (Mr. Morale download): three different track requests
(Rich Interlude, Savior Interlude, Savior) each received the same
WRONG audio file (Kendrick's R.O.T.C Interlude from his 2010 mixtape).
AcoustID flagged the title mismatch but the verification logic
SKIPPED rather than FAILED with the reason "likely same song in
different language/script."
The old condition was:
best_score >= 0.95 AND (title_sim >= 0.55 OR artist_sim >= match)
That OR-clause fired for English-vs-English titles by the same artist
that share NO actual content — same artist + word "interlude" in both
titles cleared the bar. The skip then trusted the wrong file as
correct.
New condition: only skip when there's positive evidence the mismatch
is a transliteration / language-script case:
- (a) Either side of the comparison contains non-ASCII characters AND
artist matches strongly. Real cases: Japanese kanji ↔ romaji,
Korean hangul ↔ romaji, etc.
- (b) BOTH title AND artist similarity are very high (title_sim >= 0.80
and artist_sim >= ARTIST_MATCH_THRESHOLD). Real cases: title differs
only by punctuation / casing that fell below strict-match thresholds.
For English-vs-English with very different titles by the same artist,
the skip no longer fires — verification correctly returns FAIL,
quarantining the wrong file.
"""
from __future__ import annotations
from unittest.mock import patch
import pytest
from core.acoustid_verification import (
AcoustIDVerification,
VerificationResult,
)
@pytest.fixture
def verifier(monkeypatch):
    """Provide an ``AcoustIDVerification`` whose AcoustID client is a stub.

    The stub reports itself as available and returns ``None`` from
    ``fingerprint_and_lookup`` until a test installs its own lookup, so the
    title/artist comparison logic can be driven directly.
    """

    class _StubClient:
        """Stand-in client so the tests avoid touching real chromaprint."""

        def is_available(self):
            return True, 'available'

        def fingerprint_and_lookup(self, path):
            # Empty default; individual tests replace this method with one
            # returning the fabricated lookup result they need.
            return None

    instance = AcoustIDVerification()
    instance.acoustid_client = _StubClient()
    return instance
def _stub_lookup(verifier, *, recordings, best_score):
    """Install a fake ``fingerprint_and_lookup`` on the verifier's client.

    The replacement ignores its ``path`` argument and returns a dict holding
    the given ``recordings`` and ``best_score`` plus ``recording_mbids``: the
    truthy ``'id'`` values found on the recordings, in order.
    """

    def _canned_lookup(path):
        # Assembled on every call, so each caller receives a fresh dict.
        mbids = []
        for recording in recordings:
            mbid = recording.get('id')
            if mbid:
                mbids.append(mbid)
        return {
            'recordings': recordings,
            'best_score': best_score,
            'recording_mbids': mbids,
        }

    verifier.acoustid_client.fingerprint_and_lookup = _canned_lookup
# ---------------------------------------------------------------------------
# The headline regression — Rich Interlude vs R.O.T.C Interlude
# ---------------------------------------------------------------------------
def test_english_titles_same_artist_no_longer_skipped(verifier):
    """Headline regression: 'Rich (Interlude)' must reject R.O.T.C audio.

    The request is 'Rich (Interlude)' by Kendrick Lamar while AcoustID
    identifies the file as 'R.O.T.C. (interlude)' by the same artist. The
    shared artist and the shared word 'interlude' used to be enough for a
    skip; the verdict must now be FAIL so the file gets quarantined.
    """
    identified_as = {
        'title': 'R.O.T.C. (interlude)',
        'artist': 'Kendrick Lamar feat. BJ the Chicago Kid',
    }
    _stub_lookup(verifier, recordings=[identified_as], best_score=0.96)

    verdict, message = verifier.verify_audio_file(
        '/fake/path.flac', 'Rich (Interlude)', 'Kendrick Lamar',
    )

    assert verdict == VerificationResult.FAIL
    # Expect the wrong-file wording, not the language/script skip wording.
    lowered = message.lower()
    assert 'mismatch' in lowered
    assert 'language/script' not in lowered
def test_savior_request_returning_rotc_no_longer_skipped(verifier):
    """Same wrong-file scenario with a different requested track ('Savior').

    Confirms the tightened skip rule is not specific to 'Rich (Interlude)'.
    """
    identified_as = {
        'title': 'R.O.T.C. (interlude)',
        'artist': 'Kendrick Lamar feat. BJ the Chicago Kid',
    }
    _stub_lookup(verifier, recordings=[identified_as], best_score=0.96)

    outcome = verifier.verify_audio_file(
        '/fake/path.flac', 'Savior', 'Kendrick Lamar',
    )
    verdict, _message = outcome

    assert verdict == VerificationResult.FAIL
# ---------------------------------------------------------------------------
# The legitimate skip cases — must STILL fire
# ---------------------------------------------------------------------------
def test_japanese_kanji_to_romaji_still_skipped(verifier):
    """A genuine language/script mismatch must still produce SKIP.

    The lookup returns the kanji title, the request uses the romaji title,
    the artist is identical and the fingerprint score is high, so a correct
    file must not be quarantined by mistake.
    """
    kanji_recording = {'title': '残酷な天使のテーゼ', 'artist': 'Yoko Takahashi'}
    _stub_lookup(verifier, recordings=[kanji_recording], best_score=0.97)

    verdict, message = verifier.verify_audio_file(
        '/fake/path.flac', 'Zankoku na Tenshi no Theze', 'Yoko Takahashi',
    )

    assert verdict == VerificationResult.SKIP
    assert 'language/script' in message.lower()
def test_minor_punctuation_difference_passes_outright(verifier):
    """A punctuation-only title difference must never be rejected.

    'MAAD City' vs 'M.A.A.D City' is expected to be similar enough to pass
    outright, but SKIP is accepted too; the pinned contract is only that the
    verdict is not FAIL, so a later tightening of the strict thresholds
    cannot silently push this case into the FAIL bucket.
    """
    dotted_recording = {'title': 'M.A.A.D City', 'artist': 'Kendrick Lamar'}
    _stub_lookup(verifier, recordings=[dotted_recording], best_score=0.97)

    verdict, _message = verifier.verify_audio_file(
        '/fake/path.flac', 'MAAD City', 'Kendrick Lamar',
    )

    # Either PASS or SKIP is acceptable here; only FAIL is a regression.
    assert verdict != VerificationResult.FAIL
def test_low_fingerprint_score_never_skipped(verifier):
    """Below the 0.95 fingerprint floor the skip exemption must not apply.

    Even a plausible language/script case (kanji vs romaji, same artist) is
    expected to FAIL when the fingerprint score is only 0.80.
    """
    kanji_recording = {'title': '残酷な天使のテーゼ', 'artist': 'Yoko Takahashi'}
    score_below_floor = 0.80  # under the 0.95 confidence floor
    _stub_lookup(
        verifier, recordings=[kanji_recording], best_score=score_below_floor,
    )

    verdict, _message = verifier.verify_audio_file(
        '/fake/path.flac', 'Zankoku na Tenshi no Theze', 'Yoko Takahashi',
    )

    assert verdict == VerificationResult.FAIL
def test_high_score_but_artist_mismatch_no_longer_skipped(verifier):
    """Non-ASCII text plus a high score is not enough without an artist match.

    When the identified artist differs from the requested one (for example a
    cover by someone else), the verdict is expected to be FAIL.
    """
    other_artist_recording = {
        'title': '残酷な天使のテーゼ',
        'artist': 'Some Other Singer',
    }
    _stub_lookup(verifier, recordings=[other_artist_recording], best_score=0.97)

    verdict, _message = verifier.verify_audio_file(
        '/fake/path.flac', 'Zankoku na Tenshi no Theze', 'Yoko Takahashi',
    )

    assert verdict == VerificationResult.FAIL
def test_old_loose_threshold_no_longer_fires_for_unrelated_titles(verifier):
    """Pin the negative case for the retired ``title_sim >= 0.55`` rule.

    'Crown' vs 'Crown of Thorns' scored around 0.6 in some normalizations;
    with high confidence and a matching artist the old logic would skip.
    The new logic needs title_sim >= 0.80 or non-ASCII text, so this request
    is expected to FAIL.
    """
    longer_title_recording = {
        'title': 'Crown of Thorns',
        'artist': 'Kendrick Lamar',
    }
    _stub_lookup(verifier, recordings=[longer_title_recording], best_score=0.96)

    verdict, _message = verifier.verify_audio_file(
        '/fake/path.flac', 'Crown', 'Kendrick Lamar',
    )

    # Requested 'Crown' but the audio is 'Crown of Thorns': reject it.
    assert verdict == VerificationResult.FAIL

@ -0,0 +1,202 @@
"""Pin the contract: integrity rejection must mark the task as failed.
User report (Mr. Morale download): three tracks (Rich Interlude,
Savior Interlude, Savior) showed Completed in the modal but were
missing from disk. Log trace at line 932 of `core/imports/pipeline.py`
revealed the bug:
No _final_processed_path in context for task <id> — cannot verify, assuming success
Inner ``post_process_matched_download`` quarantined the source file
(integrity check rejected duration mismatch on a wrong-content file),
which left no ``_final_processed_path`` in the context. The outer
verification wrapper saw no path and fell through to the "assuming
success" branch, marking the task as ✅ Completed even though the file
was in quarantine and would never reach the destination.
Fix: the wrapper now explicitly checks for ``_integrity_failure_msg``
and ``_race_guard_failed`` markers BEFORE the "assume success" branch.
If any failure marker is set, the task is marked failed with a
descriptive error message and the batch tracker is notified with
``success=False``.
"""
from __future__ import annotations
import threading
import types
from unittest.mock import patch
import pytest
import core.imports.pipeline as import_pipeline
import core.runtime_state as runtime_state
# ---------------------------------------------------------------------------
# Test scaffolding
# ---------------------------------------------------------------------------
@pytest.fixture
def _isolate_state():
    """Empty the global runtime maps for one test, then put them back.

    Shallow copies of ``download_tasks``, ``download_batches`` and
    ``matched_downloads_context`` are taken before the maps are cleared; after
    the test each map is cleared again and refilled from its copy, so
    mutations made by the test do not leak into other tests.
    """
    map_names = (
        'download_tasks',
        'download_batches',
        'matched_downloads_context',
    )

    # Copy everything first, and only then start clearing.
    saved = {name: dict(getattr(runtime_state, name)) for name in map_names}
    for name in map_names:
        getattr(runtime_state, name).clear()

    yield

    for name in map_names:
        live_map = getattr(runtime_state, name)
        live_map.clear()
        live_map.update(saved[name])
def _build_runtime(completion_calls):
    """Build a minimal runtime namespace that records completion callbacks.

    ``on_download_completed`` appends ``(batch, task, success)`` to
    ``completion_calls``; the remaining attributes are all ``None``.
    """

    def _record_completion(batch, task, success):
        return completion_calls.append((batch, task, success))

    runtime = types.SimpleNamespace(
        automation_engine=None,
        on_download_completed=_record_completion,
        web_scan_manager=None,
        repair_worker=None,
    )
    return runtime
def _seed_task(task_id: str = 't1', batch_id: str = 'b1') -> None:
    """Register an in-progress ('downloading') task in the global task map."""
    task_record = {
        'task_id': task_id,
        'batch_id': batch_id,
        'status': 'downloading',
        'track_info': {'name': 'Rich (Interlude)'},
    }
    runtime_state.download_tasks[task_id] = task_record
# ---------------------------------------------------------------------------
# The wrapper-level fix
# ---------------------------------------------------------------------------
def test_integrity_failure_marker_marks_task_failed(_isolate_state):
    """An ``_integrity_failure_msg`` marker must fail the task.

    The wrapper has to mark the task failed and report ``success=False`` —
    NOT fall through to the "assume success" branch.
    """
    notifications = []
    runtime = _build_runtime(notifications)
    _seed_task('t1', 'b1')

    # State left behind by an integrity rejection: the marker is set and
    # ``_final_processed_path`` was never written.
    context = {
        'task_id': 't1',
        'batch_id': 'b1',
        'context_key': 'test::ctx',
        '_integrity_failure_msg': 'Duration mismatch: file is 163s, expected 152s (drift 11s)',
    }

    def _inner_noop(*args, **kwargs):
        # The inner post-processor is out of scope here; only the wrapper's
        # handling of the marker is under test.
        return None

    with patch.object(import_pipeline, 'post_process_matched_download', _inner_noop):
        import_pipeline.post_process_matched_download_with_verification(
            'test::ctx', context, '/fake/source.flac', 't1', 'b1', runtime,
        )

    # The task carries the failed status and an integrity error message.
    task = runtime_state.download_tasks['t1']
    assert task['status'] == 'failed'
    assert 'integrity' in task['error_message'].lower()
    # The batch tracker heard success=False and never success=True.
    assert ('b1', 't1', False) in notifications
    assert ('b1', 't1', True) not in notifications
def test_race_guard_failure_marker_marks_task_failed(_isolate_state):
    """A ``_race_guard_failed`` marker must fail the task as well.

    This marker stands for a source file that disappeared with no known
    destination; the contract matches the integrity-failure case.
    """
    notifications = []
    runtime = _build_runtime(notifications)
    _seed_task('t2', 'b2')

    context = {
        'task_id': 't2',
        'batch_id': 'b2',
        'context_key': 'test::ctx2',
        '_race_guard_failed': True,
    }

    def _inner_noop(*args, **kwargs):
        return None

    with patch.object(import_pipeline, 'post_process_matched_download', _inner_noop):
        import_pipeline.post_process_matched_download_with_verification(
            'test::ctx2', context, '/fake/source.flac', 't2', 'b2', runtime,
        )

    task = runtime_state.download_tasks['t2']
    assert task['status'] == 'failed'
    assert ('b2', 't2', False) in notifications
def test_no_failure_markers_still_assumes_success(_isolate_state):
    """Without failure markers the "assume success" fallback must still run.

    Some legitimate flows complete without setting ``_final_processed_path``,
    so a context with neither markers nor a final path has to end up
    completed and be reported with ``success=True``.
    """
    notifications = []
    runtime = _build_runtime(notifications)
    _seed_task('t3', 'b3')

    # Deliberately bare: no failure markers and no _final_processed_path.
    context = {
        'task_id': 't3',
        'batch_id': 'b3',
        'context_key': 'test::ctx3',
    }

    def _inner_noop(*args, **kwargs):
        return None

    def _fake_mark_completed(task_id, ti):
        # Stand-in for the real completion helper: flip only the status.
        return runtime_state.download_tasks[task_id].update(
            {'status': 'completed'}
        )

    with patch.object(import_pipeline, 'post_process_matched_download', _inner_noop):
        with patch.object(import_pipeline, '_mark_task_completed', _fake_mark_completed):
            import_pipeline.post_process_matched_download_with_verification(
                'test::ctx3', context, '/fake/source.flac', 't3', 'b3', runtime,
            )

    assert runtime_state.download_tasks['t3']['status'] == 'completed'
    assert ('b3', 't3', True) in notifications
def test_integrity_failure_takes_priority_over_missing_final_path(_isolate_state):
    """The integrity check must win over the missing-final-path fallback.

    Both conditions hold at once here (integrity failed AND no final path);
    the task must end up failed and success must never be reported.
    """
    notifications = []
    runtime = _build_runtime(notifications)
    _seed_task('t4', 'b4')

    # With no _final_processed_path, this context would otherwise reach the
    # "assume success" branch.
    context = {
        'task_id': 't4',
        'batch_id': 'b4',
        'context_key': 'test::ctx4',
        '_integrity_failure_msg': 'duration mismatch',
    }

    def _inner_noop(*args, **kwargs):
        return None

    with patch.object(import_pipeline, 'post_process_matched_download', _inner_noop):
        import_pipeline.post_process_matched_download_with_verification(
            'test::ctx4', context, '/fake/source.flac', 't4', 'b4', runtime,
        )

    assert runtime_state.download_tasks['t4']['status'] == 'failed'
    # Critical: a success notification must never have been sent.
    assert ('b4', 't4', True) not in notifications

@ -3444,6 +3444,7 @@ const WHATS_NEW = {
'2.4.2': [
// --- post-2.4.1 dev work — entries hidden by _getLatestWhatsNewVersion until the build version bumps ---
{ date: 'Unreleased — 2.4.2 dev cycle' },
{ title: 'Fix: Tracks Showed Completed When File Was Quarantined', desc: 'caught downloading kendrick mr morale: three tracks (rich interlude, savior interlude, savior) showed ✅ completed in the modal but were missing on disk. two layered bugs. (1) the post-process verification wrapper had a fallback that assumed success when no `_final_processed_path` was in context — but integrity-rejected files (which get quarantined instead of moved) leave that path unset, so the wrapper marked them complete. now wrapper explicitly checks `_integrity_failure_msg` and `_race_guard_failed` markers before the assume-success fallback. failed integrity = task marked failed, batch tracker notified with success=false. (2) acoustid skip-logic was too lenient — when fingerprint confidence was very high and either title OR artist matched a bit, it skipped verification with reason "likely same song in different language/script." that fired for english-vs-english by the same artist with the word "interlude" in both — same artist + 0.55 title sim = skip = wrong file accepted. tightened: skip now requires non-ASCII chars present (real language/script case) AND artist match, OR very high title similarity (≥0.80) AND artist match. english-vs-english with very different titles by same artist no longer skipped — verification correctly returns FAIL and the wrong file gets quarantined.', page: 'downloads' },
{ title: 'Stop Navidrome From Splitting Albums Over Inconsistent MBIDs', desc: 'discord report (samuel [KC]): tracks of the same album sometimes carry different MUSICBRAINZ_ALBUMID tags, which causes navidrome to split the album into multiple entries. two-part fix: (1) the MBID Mismatch Detector now does a second scan that groups tracks by db album, finds the consensus (most-common) album mbid, and flags dissenters — fix action rewrites the dissenter\'s tag to match. catches existing inconsistencies in your library. (2) root cause: per-track musicbrainz release lookups went through an in-memory cache that\'s capped at 4096 entries and dies on server restart, so big libraries / restarts could resolve different release ids for tracks of the same album. added a persistent sqlite-backed cache so a release mbid resolved ONCE for an album applies to every future track of that album for the install\'s lifetime. strictly additive: any failure in the persistent layer falls through to the live musicbrainz lookup exactly as before.', page: 'library' },
{ title: 'Lidarr: Right Track Lands on Disk + Profile Lookup Stops Failing', desc: 'lidarr is an album-grabber — when you ask for one track it grabs the whole album, then we pick the wanted track out. old code blindly took the first imported file as the result, so any track you asked for got mistagged as track 1 of the album. now matches the wanted title against lidarr\'s track list (with punctuation-tolerant fuzzy compare) and copies only that file. also fixed a hardcoded `metadataProfileId=1` that broke artist-add on installs where someone had renamed/recreated profiles, and a polling-loop bug where the inner break never escaped the outer poll loop so completion detection was delayed. settings tooltip updated to be honest: lidarr is best for full-album grabs and effectively a no-op for playlist sync (track searches return nothing useful, hybrid mode falls through to your other sources).', page: 'settings' },
{ title: 'SoundCloud as a Download Source', desc: 'discord request (toasti): some tracks (DJ mixes, sets, removed-from-spotify exclusives) only live on soundcloud. soundcloud now plugs into the existing download-source picker on settings → downloads — pick "SoundCloud Only" or include it in the hybrid order alongside soulseek / youtube / tidal / qobuz / hifi / deezer / lidarr. anonymous-only (no account needed); quality is whatever soundcloud serves anonymously, typically 128 kbps mp3 or aac depending on the upload. soundcloud doesn\'t expose lossless to anyone, so don\'t expect flac. follows the exact same wiring contract as every other download source — search dispatch, hybrid fallback, queue / cancel / clear, sidebar source label, provenance + library history all work plug-and-play.', page: 'settings' },

Loading…
Cancel
Save