Merge pull request #431 from Nezreka/refactor/lift-duplicate-cleaner

Lift _run_duplicate_cleaner to core/library/duplicate_cleaner.py
pull/432/head
BoulderBadgeDad 3 weeks ago committed by GitHub
commit 167718d694
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -0,0 +1,223 @@
"""Duplicate cleaner — lifted from web_server.py.
The function body is byte-identical to the original. Module-level
state and helpers are injected via init() because the duplicate
cleaner state dict, lock, automation engine, and docker_resolve_path
helper all live in web_server.py.
"""
import logging
from config.settings import config_manager
from core.runtime_state import add_activity_item
logger = logging.getLogger(__name__)
# Injected at runtime via init().
duplicate_cleaner_state = None
duplicate_cleaner_lock = None
docker_resolve_path = None
automation_engine = None
def init(state, lock, resolve_path_fn, engine):
"""Bind shared state/helpers from web_server."""
global duplicate_cleaner_state, duplicate_cleaner_lock
global docker_resolve_path, automation_engine
duplicate_cleaner_state = state
duplicate_cleaner_lock = lock
docker_resolve_path = resolve_path_fn
automation_engine = engine
def _run_duplicate_cleaner():
"""Main duplicate cleaner worker function - scans Transfer folder for duplicate files"""
import os
import shutil
from collections import defaultdict
from pathlib import Path
try:
with duplicate_cleaner_lock:
duplicate_cleaner_state["status"] = "running"
duplicate_cleaner_state["phase"] = "Initializing scan..."
duplicate_cleaner_state["progress"] = 0
duplicate_cleaner_state["files_scanned"] = 0
duplicate_cleaner_state["total_files"] = 0
duplicate_cleaner_state["duplicates_found"] = 0
duplicate_cleaner_state["deleted"] = 0
duplicate_cleaner_state["space_freed"] = 0
duplicate_cleaner_state["error_message"] = ""
logger.warning("[Duplicate Cleaner] Starting duplicate scan...")
# Get Transfer folder path from config
transfer_folder = docker_resolve_path(config_manager.get('soulseek.transfer_path', './Transfer'))
if not transfer_folder or not os.path.exists(transfer_folder):
with duplicate_cleaner_lock:
duplicate_cleaner_state["status"] = "error"
duplicate_cleaner_state["phase"] = "Output folder not configured or does not exist"
duplicate_cleaner_state["error_message"] = "Please configure output folder in settings"
logger.warning(f"[Duplicate Cleaner] Transfer folder not found: {transfer_folder}")
return
# Create deleted folder if it doesn't exist
deleted_folder = os.path.join(transfer_folder, 'deleted')
os.makedirs(deleted_folder, exist_ok=True)
logger.warning(f"[Duplicate Cleaner] Deleted folder: {deleted_folder}")
# Phase 1: Count total files for progress tracking
with duplicate_cleaner_lock:
duplicate_cleaner_state["phase"] = "Counting files..."
total_files = 0
for _root, dirs, files in os.walk(transfer_folder):
# Skip the deleted folder itself
if 'deleted' in dirs:
dirs.remove('deleted')
total_files += len(files)
logger.warning(f"[Duplicate Cleaner] Found {total_files} total files to scan")
with duplicate_cleaner_lock:
duplicate_cleaner_state["total_files"] = total_files
duplicate_cleaner_state["phase"] = f"Scanning {total_files} files..."
# Phase 2: Scan and group files by directory and filename
# Structure: {directory_path: {filename_without_ext: [full_file_paths]}}
files_by_dir_and_name = defaultdict(lambda: defaultdict(list))
files_scanned = 0
# Audio file extensions to consider
audio_extensions = {'.flac', '.mp3', '.m4a', '.aac', '.opus', '.ogg', '.wav', '.ape', '.wma', '.alac', '.aiff', '.aif', '.dsf', '.dff'}
for root, dirs, files in os.walk(transfer_folder):
# Skip the deleted folder
if 'deleted' in dirs:
dirs.remove('deleted')
for file in files:
files_scanned += 1
# Update progress
with duplicate_cleaner_lock:
duplicate_cleaner_state["files_scanned"] = files_scanned
duplicate_cleaner_state["progress"] = (files_scanned / total_files) * 100 if total_files > 0 else 0
duplicate_cleaner_state["phase"] = f"Scanning: {file}"
# Get file extension
file_path = os.path.join(root, file)
file_name, file_ext = os.path.splitext(file)
file_ext_lower = file_ext.lower()
# Only process audio files
if file_ext_lower not in audio_extensions:
continue
# Group by directory and filename (without extension)
files_by_dir_and_name[root][file_name].append({
'full_path': file_path,
'extension': file_ext_lower,
'size': os.path.getsize(file_path)
})
# Phase 3: Process duplicates
with duplicate_cleaner_lock:
duplicate_cleaner_state["phase"] = "Processing duplicates..."
# Quality priority: FLAC > OPUS/OGG > M4A/AAC > MP3/WMA
format_priority = {
'.flac': 1, '.ape': 1, '.wav': 1, '.alac': 1, '.aiff': 1, '.aif': 1, '.dsf': 1, '.dff': 1, # Lossless
'.opus': 2, '.ogg': 2, # High quality lossy
'.m4a': 3, '.aac': 3, # Standard lossy
'.mp3': 4, '.wma': 4 # Lower quality lossy
}
duplicates_found = 0
deleted_count = 0
space_freed = 0
for directory, files_by_name in files_by_dir_and_name.items():
for filename, file_versions in files_by_name.items():
# Only process if we have duplicates (more than one version)
if len(file_versions) <= 1:
continue
duplicates_found += len(file_versions) - 1 # Count all but the one we keep
logger.warning(f"[Duplicate Cleaner] Found {len(file_versions)} versions of '{filename}' in {directory}")
# Sort by priority: best format first, then largest size
def sort_key(f):
priority = format_priority.get(f['extension'], 999)
size = f['size']
return (priority, -size) # Negative size for descending order
sorted_versions = sorted(file_versions, key=sort_key)
# Keep the first one (best quality), delete the rest
best_version = sorted_versions[0]
logger.warning(f"[Duplicate Cleaner] Keeping: {os.path.basename(best_version['full_path'])} "
f"({best_version['extension']}, {best_version['size']} bytes)")
for duplicate_file in sorted_versions[1:]:
try:
# Move to deleted folder with relative path preserved
relative_path = os.path.relpath(duplicate_file['full_path'], transfer_folder)
deleted_path = os.path.join(deleted_folder, relative_path)
# Create subdirectories in deleted folder if needed
os.makedirs(os.path.dirname(deleted_path), exist_ok=True)
# Move the file
shutil.move(duplicate_file['full_path'], deleted_path)
# Track stats
deleted_count += 1
space_freed += duplicate_file['size']
logger.warning(f"[Duplicate Cleaner] Moved to deleted: {os.path.basename(duplicate_file['full_path'])} "
f"({duplicate_file['extension']}, {duplicate_file['size']} bytes)")
# Update stats
with duplicate_cleaner_lock:
duplicate_cleaner_state["deleted"] = deleted_count
duplicate_cleaner_state["space_freed"] = space_freed
duplicate_cleaner_state["duplicates_found"] = duplicates_found
except Exception as e:
logger.error(f"[Duplicate Cleaner] Error moving file {duplicate_file['full_path']}: {e}")
continue
# Scan complete
with duplicate_cleaner_lock:
duplicate_cleaner_state["status"] = "finished"
duplicate_cleaner_state["progress"] = 100
duplicate_cleaner_state["phase"] = "Cleaning complete"
space_mb = space_freed / (1024 * 1024)
logger.warning(f"[Duplicate Cleaner] Scan complete: {files_scanned} files scanned, "
f"{duplicates_found} duplicates found, {deleted_count} files moved to deleted folder, "
f"{space_mb:.2f} MB freed")
# Add activity
add_activity_item("", "Duplicate Cleaner Complete",
f"{deleted_count} files removed, {space_mb:.1f} MB freed", "Now")
try:
if automation_engine:
automation_engine.emit('duplicate_scan_completed', {
'files_scanned': str(files_scanned),
'duplicates_found': str(duplicates_found),
'space_freed': f"{space_mb:.1f} MB",
})
except Exception:
pass
except Exception as e:
logger.error(f"[Duplicate Cleaner] Critical error: {e}")
import traceback
traceback.print_exc()
with duplicate_cleaner_lock:
duplicate_cleaner_state["status"] = "error"
duplicate_cleaner_state["error_message"] = str(e)
duplicate_cleaner_state["phase"] = f"Error: {str(e)}"

@ -17416,198 +17416,16 @@ def _run_quality_scanner(scope='watchlist', profile_id=1):
)
def _run_duplicate_cleaner():
"""Main duplicate cleaner worker function - scans Transfer folder for duplicate files"""
import os
import shutil
from collections import defaultdict
from pathlib import Path
try:
with duplicate_cleaner_lock:
duplicate_cleaner_state["status"] = "running"
duplicate_cleaner_state["phase"] = "Initializing scan..."
duplicate_cleaner_state["progress"] = 0
duplicate_cleaner_state["files_scanned"] = 0
duplicate_cleaner_state["total_files"] = 0
duplicate_cleaner_state["duplicates_found"] = 0
duplicate_cleaner_state["deleted"] = 0
duplicate_cleaner_state["space_freed"] = 0
duplicate_cleaner_state["error_message"] = ""
logger.warning("[Duplicate Cleaner] Starting duplicate scan...")
# Get Transfer folder path from config
transfer_folder = docker_resolve_path(config_manager.get('soulseek.transfer_path', './Transfer'))
if not transfer_folder or not os.path.exists(transfer_folder):
with duplicate_cleaner_lock:
duplicate_cleaner_state["status"] = "error"
duplicate_cleaner_state["phase"] = "Output folder not configured or does not exist"
duplicate_cleaner_state["error_message"] = "Please configure output folder in settings"
logger.warning(f"[Duplicate Cleaner] Transfer folder not found: {transfer_folder}")
return
# Create deleted folder if it doesn't exist
deleted_folder = os.path.join(transfer_folder, 'deleted')
os.makedirs(deleted_folder, exist_ok=True)
logger.warning(f"[Duplicate Cleaner] Deleted folder: {deleted_folder}")
# Phase 1: Count total files for progress tracking
with duplicate_cleaner_lock:
duplicate_cleaner_state["phase"] = "Counting files..."
total_files = 0
for _root, dirs, files in os.walk(transfer_folder):
# Skip the deleted folder itself
if 'deleted' in dirs:
dirs.remove('deleted')
total_files += len(files)
logger.warning(f"[Duplicate Cleaner] Found {total_files} total files to scan")
with duplicate_cleaner_lock:
duplicate_cleaner_state["total_files"] = total_files
duplicate_cleaner_state["phase"] = f"Scanning {total_files} files..."
# Phase 2: Scan and group files by directory and filename
# Structure: {directory_path: {filename_without_ext: [full_file_paths]}}
files_by_dir_and_name = defaultdict(lambda: defaultdict(list))
files_scanned = 0
# Audio file extensions to consider
audio_extensions = {'.flac', '.mp3', '.m4a', '.aac', '.opus', '.ogg', '.wav', '.ape', '.wma', '.alac', '.aiff', '.aif', '.dsf', '.dff'}
for root, dirs, files in os.walk(transfer_folder):
# Skip the deleted folder
if 'deleted' in dirs:
dirs.remove('deleted')
for file in files:
files_scanned += 1
# Update progress
with duplicate_cleaner_lock:
duplicate_cleaner_state["files_scanned"] = files_scanned
duplicate_cleaner_state["progress"] = (files_scanned / total_files) * 100 if total_files > 0 else 0
duplicate_cleaner_state["phase"] = f"Scanning: {file}"
# Get file extension
file_path = os.path.join(root, file)
file_name, file_ext = os.path.splitext(file)
file_ext_lower = file_ext.lower()
# Only process audio files
if file_ext_lower not in audio_extensions:
continue
# Group by directory and filename (without extension)
files_by_dir_and_name[root][file_name].append({
'full_path': file_path,
'extension': file_ext_lower,
'size': os.path.getsize(file_path)
})
# Phase 3: Process duplicates
with duplicate_cleaner_lock:
duplicate_cleaner_state["phase"] = "Processing duplicates..."
# Quality priority: FLAC > OPUS/OGG > M4A/AAC > MP3/WMA
format_priority = {
'.flac': 1, '.ape': 1, '.wav': 1, '.alac': 1, '.aiff': 1, '.aif': 1, '.dsf': 1, '.dff': 1, # Lossless
'.opus': 2, '.ogg': 2, # High quality lossy
'.m4a': 3, '.aac': 3, # Standard lossy
'.mp3': 4, '.wma': 4 # Lower quality lossy
}
duplicates_found = 0
deleted_count = 0
space_freed = 0
for directory, files_by_name in files_by_dir_and_name.items():
for filename, file_versions in files_by_name.items():
# Only process if we have duplicates (more than one version)
if len(file_versions) <= 1:
continue
duplicates_found += len(file_versions) - 1 # Count all but the one we keep
logger.warning(f"[Duplicate Cleaner] Found {len(file_versions)} versions of '{filename}' in {directory}")
# Sort by priority: best format first, then largest size
def sort_key(f):
priority = format_priority.get(f['extension'], 999)
size = f['size']
return (priority, -size) # Negative size for descending order
sorted_versions = sorted(file_versions, key=sort_key)
# Keep the first one (best quality), delete the rest
best_version = sorted_versions[0]
logger.warning(f"[Duplicate Cleaner] Keeping: {os.path.basename(best_version['full_path'])} "
f"({best_version['extension']}, {best_version['size']} bytes)")
for duplicate_file in sorted_versions[1:]:
try:
# Move to deleted folder with relative path preserved
relative_path = os.path.relpath(duplicate_file['full_path'], transfer_folder)
deleted_path = os.path.join(deleted_folder, relative_path)
# Create subdirectories in deleted folder if needed
os.makedirs(os.path.dirname(deleted_path), exist_ok=True)
# Move the file
shutil.move(duplicate_file['full_path'], deleted_path)
# Track stats
deleted_count += 1
space_freed += duplicate_file['size']
logger.warning(f"[Duplicate Cleaner] Moved to deleted: {os.path.basename(duplicate_file['full_path'])} "
f"({duplicate_file['extension']}, {duplicate_file['size']} bytes)")
# Update stats
with duplicate_cleaner_lock:
duplicate_cleaner_state["deleted"] = deleted_count
duplicate_cleaner_state["space_freed"] = space_freed
duplicate_cleaner_state["duplicates_found"] = duplicates_found
except Exception as e:
logger.error(f"[Duplicate Cleaner] Error moving file {duplicate_file['full_path']}: {e}")
continue
# Scan complete
with duplicate_cleaner_lock:
duplicate_cleaner_state["status"] = "finished"
duplicate_cleaner_state["progress"] = 100
duplicate_cleaner_state["phase"] = "Cleaning complete"
space_mb = space_freed / (1024 * 1024)
logger.warning(f"[Duplicate Cleaner] Scan complete: {files_scanned} files scanned, "
f"{duplicates_found} duplicates found, {deleted_count} files moved to deleted folder, "
f"{space_mb:.2f} MB freed")
# Add activity
add_activity_item("", "Duplicate Cleaner Complete",
f"{deleted_count} files removed, {space_mb:.1f} MB freed", "Now")
try:
if automation_engine:
automation_engine.emit('duplicate_scan_completed', {
'files_scanned': str(files_scanned),
'duplicates_found': str(duplicates_found),
'space_freed': f"{space_mb:.1f} MB",
})
except Exception:
pass
except Exception as e:
logger.error(f"[Duplicate Cleaner] Critical error: {e}")
import traceback
traceback.print_exc()
with duplicate_cleaner_lock:
duplicate_cleaner_state["status"] = "error"
duplicate_cleaner_state["error_message"] = str(e)
duplicate_cleaner_state["phase"] = f"Error: {str(e)}"
from core.library.duplicate_cleaner import (
_run_duplicate_cleaner,
init as _init_duplicate_cleaner,
)
_init_duplicate_cleaner(
state=duplicate_cleaner_state,
lock=duplicate_cleaner_lock,
resolve_path_fn=docker_resolve_path,
engine=automation_engine,
)
@app.route('/api/quality-scanner/start', methods=['POST'])
def start_quality_scan():

Loading…
Cancel
Save