Merge pull request #449 from Nezreka/fix/duplicate-detector-mount-paths

Filter same-physical-file duplicates from duplicate detector
pull/450/head
BoulderBadgeDad 4 weeks ago committed by GitHub
commit 09591ad089
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -154,6 +154,18 @@ class DuplicateDetectorJob(RepairJob):
if ignore_cross_album and t1['album'] and t2['album'] and t1['album'] != t2['album']:
continue
# Skip pairs that are the same physical file mounted at
# different roots — e.g. /app/Transfer/... and /media/Music/...
# when the user binds the same host directory into both
# SoulSync and Plex containers. Both rows end up in the DB
# (one from SoulSync's local scan, one from Plex's sync),
# but they point at one file on disk.
if _is_same_physical_file(
t1['file_path'], t2['file_path'],
t1['duration'], t2['duration'],
):
continue
group.append(t2)
if len(group) >= 2:
@ -229,3 +241,43 @@ def _normalize(text: str) -> str:
t = text.lower()
t = re.sub(r'[^a-z0-9() ]', '', t)
return t.strip()
def _is_same_physical_file(p1, p2, dur1, dur2) -> bool:
"""Detect when two DB rows point at the same file mounted at different paths.
When a user binds the same host music directory into both SoulSync
(e.g. ``/app/Transfer``) and a media server like Plex (e.g.
``/media/Music``), the SoulSync scan and the media-server library
sync each create a track row pointing at the same physical file
via different mount paths. The two rows then look like a fuzzy-
match duplicate to this job.
Returns True when:
- Both paths share the last 3 segments (filename + album + artist
folder), so they really are the same release on disk;
- The leading mount-root segments differ, ruling out the case
where one row is just a re-scan of the other path; and
- When both rows carry a duration, the durations agree within 1
second (defensive different files at parallel paths would
almost always disagree on duration even slightly).
"""
if not p1 or not p2:
return False
norm1 = str(p1).replace('\\', '/').rstrip('/')
norm2 = str(p2).replace('\\', '/').rstrip('/')
parts1 = [x for x in norm1.split('/') if x]
parts2 = [x for x in norm2.split('/') if x]
if len(parts1) < 3 or len(parts2) < 3:
return False
tail1 = [s.lower() for s in parts1[-3:]]
tail2 = [s.lower() for s in parts2[-3:]]
if tail1 != tail2:
return False
# Confirm mount roots actually differ, otherwise we'd skip
# legitimate duplicates that happen to share the trailing path.
if parts1[:-3] == parts2[:-3]:
return False
if dur1 and dur2 and abs(dur1 - dur2) > 1.0:
return False
return True

@ -0,0 +1,82 @@
"""Regression tests for duplicate detector mount-path filter.
When a user binds the same host music directory into both SoulSync
and a media server (e.g. Plex at /media/Music, SoulSync at
/app/Transfer), the duplicate detector used to flag the two DB rows
that point at the same physical file as a duplicate group. The new
``_is_same_physical_file`` helper filters those pairs out.
"""
from core.repair_jobs.duplicate_detector import _is_same_physical_file
class TestIsSamePhysicalFile:
    """Behavioural checks for the ``_is_same_physical_file`` helper."""

    def test_same_file_at_different_mount_roots_is_filtered(self) -> None:
        """Originally reported case: one host directory bound into the
        SoulSync and Plex containers under different mount points."""
        soulsync_path = "/app/Transfer/The Smashing Pumpkins/MACHINA _ The Machines of God/15 - With Every Light.flac"
        plex_path = "/media/Music/The Smashing Pumpkins/MACHINA _ The Machines of God/15 - With Every Light.flac"
        assert _is_same_physical_file(soulsync_path, plex_path, 235.0, 235.0)

    def test_durations_within_one_second_pass(self) -> None:
        """Up to one second of drift is tolerated, since metadata readers
        do not always round durations the same way."""
        first = "/a/Artist/Album/track.flac"
        second = "/b/Artist/Album/track.flac"
        assert _is_same_physical_file(first, second, 120.5, 121.0)

    def test_durations_more_than_one_second_apart_does_not_match(self) -> None:
        """Identically named files whose audio clearly differs must stay
        visible to the detector."""
        first = "/a/Artist/Album/track.flac"
        second = "/b/Artist/Album/track.flac"
        assert not _is_same_physical_file(first, second, 120.0, 130.0)

    def test_legit_duplicate_under_same_root_is_not_filtered(self) -> None:
        """Matching Artist/Album/file beneath one and the same root is a
        genuine duplicate (e.g. a re-download), not a second mount of the
        same file, so it must still be flagged."""
        first = "/app/Transfer/Artist/Album/track.flac"
        second = "/app/Transfer/Artist/Album/track.flac"
        assert not _is_same_physical_file(first, second, 200.0, 200.0)

    def test_legit_duplicate_under_sibling_albums_is_not_filtered(self) -> None:
        """Real duplicate downloads living in different parent folders
        must remain in the duplicate group."""
        in_album_a = "/app/Transfer/Artist/Album A/track.flac"
        in_album_b = "/app/Transfer/Artist/Album B/track.flac"
        # The album folder is part of the trailing three segments, so the
        # tails differ and the helper declines to filter the pair.
        assert not _is_same_physical_file(in_album_a, in_album_b, 200.0, 200.0)

    def test_paths_too_short_returns_false(self) -> None:
        """Defensive: too little path context means no filtering."""
        shallow_pairs = (
            ("/a.flac", "/b.flac"),
            ("/a/b.flac", "/c/b.flac"),
        )
        for first, second in shallow_pairs:
            assert not _is_same_physical_file(first, second, 200.0, 200.0)

    def test_missing_paths_returns_false(self) -> None:
        """A missing or empty path on either side never matches."""
        incomplete_pairs = (
            (None, "/a/b/c.flac"),
            ("", "/a/b/c.flac"),
            ("/a/b/c.flac", None),
        )
        for first, second in incomplete_pairs:
            assert not _is_same_physical_file(first, second, 200.0, 200.0)

    def test_missing_durations_still_filters_when_paths_match(self) -> None:
        """Without usable duration data the decision rests on the paths
        alone; an equal path tail is already a strong signal."""
        soulsync_path = "/app/Transfer/Artist/Album/track.flac"
        plex_path = "/media/Music/Artist/Album/track.flac"
        for first_duration, second_duration in ((None, None), (200.0, None), (None, 200.0)):
            assert _is_same_physical_file(
                soulsync_path, plex_path, first_duration, second_duration
            )

    def test_windows_style_paths_normalize(self) -> None:
        """Backslash-separated paths from Windows hosts still match."""
        windows_path = "C:\\Music\\Artist\\Album\\track.flac"
        posix_path = "/media/Music/Artist/Album/track.flac"
        assert _is_same_physical_file(windows_path, posix_path, 200.0, 200.0)

    def test_case_insensitive_match(self) -> None:
        """The path tail is compared case-insensitively, because
        filesystems disagree about case sensitivity."""
        mixed_case = "/app/Transfer/The Smashing Pumpkins/MACHINA/15 - With Every Light.flac"
        lower_case = "/media/Music/the smashing pumpkins/machina/15 - with every light.flac"
        assert _is_same_physical_file(mixed_case, lower_case, 200.0, 200.0)
Loading…
Cancel
Save