diff --git a/core/repair_jobs/duplicate_detector.py b/core/repair_jobs/duplicate_detector.py index 2f1e3045..2b901b25 100644 --- a/core/repair_jobs/duplicate_detector.py +++ b/core/repair_jobs/duplicate_detector.py @@ -154,6 +154,18 @@ class DuplicateDetectorJob(RepairJob): if ignore_cross_album and t1['album'] and t2['album'] and t1['album'] != t2['album']: continue + # Skip pairs that are the same physical file mounted at + # different roots — e.g. /app/Transfer/... and /media/Music/... + # when the user binds the same host directory into both + # SoulSync and Plex containers. Both rows end up in the DB + # (one from SoulSync's local scan, one from Plex's sync), + # but they point at one file on disk. + if _is_same_physical_file( + t1['file_path'], t2['file_path'], + t1['duration'], t2['duration'], + ): + continue + group.append(t2) if len(group) >= 2: @@ -229,3 +241,43 @@ def _normalize(text: str) -> str: t = text.lower() t = re.sub(r'[^a-z0-9() ]', '', t) return t.strip() + + +def _is_same_physical_file(p1, p2, dur1, dur2) -> bool: + """Detect when two DB rows point at the same file mounted at different paths. + + When a user binds the same host music directory into both SoulSync + (e.g. ``/app/Transfer``) and a media server like Plex (e.g. + ``/media/Music``), the SoulSync scan and the media-server library + sync each create a track row pointing at the same physical file + via different mount paths. The two rows then look like a fuzzy- + match duplicate to this job. + + Returns True when: + - Both paths share the last 3 segments (filename + album + artist + folder), so they really are the same release on disk; + - The leading mount-root segments differ, ruling out the case + where one row is just a re-scan of the other path; and + - When both rows carry a duration, the durations agree within 1 + second (defensive — different files at parallel paths would + almost always disagree on duration even slightly). 
+ """ + if not p1 or not p2: + return False + norm1 = str(p1).replace('\\', '/').rstrip('/') + norm2 = str(p2).replace('\\', '/').rstrip('/') + parts1 = [x for x in norm1.split('/') if x] + parts2 = [x for x in norm2.split('/') if x] + if len(parts1) < 3 or len(parts2) < 3: + return False + tail1 = [s.lower() for s in parts1[-3:]] + tail2 = [s.lower() for s in parts2[-3:]] + if tail1 != tail2: + return False + # Confirm mount roots actually differ, otherwise we'd skip + # legitimate duplicates that happen to share the trailing path. + if parts1[:-3] == parts2[:-3]: + return False + if dur1 and dur2 and abs(dur1 - dur2) > 1.0: + return False + return True diff --git a/tests/test_duplicate_detector_mount_paths.py b/tests/test_duplicate_detector_mount_paths.py new file mode 100644 index 00000000..937095cf --- /dev/null +++ b/tests/test_duplicate_detector_mount_paths.py @@ -0,0 +1,82 @@ +"""Regression tests for duplicate detector mount-path filter. + +When a user binds the same host music directory into both SoulSync +and a media server (e.g. Plex at /media/Music, SoulSync at +/app/Transfer), the duplicate detector used to flag the two DB rows +that point at the same physical file as a duplicate group. The new +``_is_same_physical_file`` helper filters those pairs out. 
class TestIsSamePhysicalFile:
    """Regression coverage for the ``_is_same_physical_file`` mount-path filter."""

    def test_same_file_at_different_mount_roots_is_filtered(self) -> None:
        """Reported case: one host directory bound at two different
        mount points, one per container."""
        soulsync_path = "/app/Transfer/The Smashing Pumpkins/MACHINA _ The Machines of God/15 - With Every Light.flac"
        plex_path = "/media/Music/The Smashing Pumpkins/MACHINA _ The Machines of God/15 - With Every Light.flac"
        assert _is_same_physical_file(soulsync_path, plex_path, 235.0, 235.0)

    def test_durations_within_one_second_pass(self) -> None:
        """A sub-second disagreement between durations is tolerated,
        since metadata readers may round slightly differently."""
        left = "/a/Artist/Album/track.flac"
        right = "/b/Artist/Album/track.flac"
        assert _is_same_physical_file(left, right, 120.5, 121.0)

    def test_durations_more_than_one_second_apart_does_not_match(self) -> None:
        """Same-named files whose durations clearly disagree are
        different audio and must not be filtered."""
        left = "/a/Artist/Album/track.flac"
        right = "/b/Artist/Album/track.flac"
        assert not _is_same_physical_file(left, right, 120.0, 130.0)

    def test_legit_duplicate_under_same_root_is_not_filtered(self) -> None:
        """Identical Artist/Album/file under an identical root is not a
        two-mounts situation, so the detector should still flag it."""
        first = "/app/Transfer/Artist/Album/track.flac"
        second = "/app/Transfer/Artist/Album/track.flac"
        assert not _is_same_physical_file(first, second, 200.0, 200.0)

    def test_legit_duplicate_under_sibling_albums_is_not_filtered(self) -> None:
        """Duplicate downloads living in different album folders must
        remain in the duplicate group."""
        in_album_a = "/app/Transfer/Artist/Album A/track.flac"
        in_album_b = "/app/Transfer/Artist/Album B/track.flac"
        # The album folder is part of the trailing three segments, so the
        # tails differ and the helper reports "not the same file".
        assert not _is_same_physical_file(in_album_a, in_album_b, 200.0, 200.0)

    def test_paths_too_short_returns_false(self) -> None:
        """Paths with fewer than three segments give too little context
        to filter on."""
        short_pairs = (
            ("/a.flac", "/b.flac"),
            ("/a/b.flac", "/c/b.flac"),
        )
        for left, right in short_pairs:
            assert not _is_same_physical_file(left, right, 200.0, 200.0)

    def test_missing_paths_returns_false(self) -> None:
        """A ``None`` or empty path on either side never matches."""
        incomplete_pairs = (
            (None, "/a/b/c.flac"),
            ("", "/a/b/c.flac"),
            ("/a/b/c.flac", None),
        )
        for left, right in incomplete_pairs:
            assert not _is_same_physical_file(left, right, 200.0, 200.0)

    def test_missing_durations_still_filters_when_paths_match(self) -> None:
        """Without duration data on one or both rows, the path tail
        alone decides the match."""
        soulsync_path = "/app/Transfer/Artist/Album/track.flac"
        plex_path = "/media/Music/Artist/Album/track.flac"
        for first_duration, second_duration in ((None, None), (200.0, None), (None, 200.0)):
            assert _is_same_physical_file(
                soulsync_path, plex_path, first_duration, second_duration
            )

    def test_windows_style_paths_normalize(self) -> None:
        """A backslash-separated path still matches its
        forward-slash counterpart."""
        windows_path = "C:\\Music\\Artist\\Album\\track.flac"
        posix_path = "/media/Music/Artist/Album/track.flac"
        assert _is_same_physical_file(windows_path, posix_path, 200.0, 200.0)

    def test_case_insensitive_match(self) -> None:
        """Letter-case differences in the trailing segments do not
        defeat the filter."""
        mixed_case = "/app/Transfer/The Smashing Pumpkins/MACHINA/15 - With Every Light.flac"
        lower_case = "/media/Music/the smashing pumpkins/machina/15 - with every light.flac"
        assert _is_same_physical_file(mixed_case, lower_case, 200.0, 200.0)