From c3a3510c75252f9be7b62b01dd03d0f4da789d90 Mon Sep 17 00:00:00 2001 From: Broque Thomas <26755000+Nezreka@users.noreply.github.com> Date: Sun, 5 Apr 2026 00:27:55 -0700 Subject: [PATCH] Add Live/Commentary Cleaner library maintenance job MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New repair job that scans track and album titles for live performances, commentary, interviews, skits, and spoken word content. Creates findings for user review — no auto-fix. Configurable per content type (live, commentary, interviews, spoken word), with optional album title scanning and tracks/albums scope toggle. Fix action removes track from DB + deletes file, cleans up empty albums and directories. Follows existing repair job pattern exactly. --- core/repair_jobs/__init__.py | 1 + core/repair_jobs/live_commentary_cleaner.py | 236 ++++++++++++++++++++ core/repair_worker.py | 68 ++++++ 3 files changed, 305 insertions(+) create mode 100644 core/repair_jobs/live_commentary_cleaner.py diff --git a/core/repair_jobs/__init__.py b/core/repair_jobs/__init__.py index 49cbe82f..4760e35d 100644 --- a/core/repair_jobs/__init__.py +++ b/core/repair_jobs/__init__.py @@ -41,6 +41,7 @@ _JOB_MODULES = [ 'core.repair_jobs.single_album_dedup', 'core.repair_jobs.lossy_converter', 'core.repair_jobs.album_tag_consistency', + 'core.repair_jobs.live_commentary_cleaner', ] diff --git a/core/repair_jobs/live_commentary_cleaner.py b/core/repair_jobs/live_commentary_cleaner.py new file mode 100644 index 00000000..55b99d81 --- /dev/null +++ b/core/repair_jobs/live_commentary_cleaner.py @@ -0,0 +1,236 @@ +"""Live/Commentary Cleaner Job — finds live, commentary, and interview content in the library.""" + +import re +from collections import defaultdict + +from core.repair_jobs import register_job +from core.repair_jobs.base import JobContext, JobResult, RepairJob +from utils.logging_config import get_logger + +logger = get_logger("repair_job.live_commentary_cleaner") 
# Regex patterns that indicate unwanted content types.
# Each tuple: (regex applied to the lower-cased text, content_type label).
# Order matters: the first matching entry wins. r'\blive\b' matches any
# standalone word "live", so the more specific "live at/from/in" entries that
# follow it can never be the first hit; they are kept for readability only.
_CONTENT_PATTERNS = [
    # Live
    (r'\blive\b', 'live'),
    (r'\blive at\b', 'live'),
    (r'\blive from\b', 'live'),
    (r'\blive in\b', 'live'),
    (r'\bin concert\b', 'live'),
    (r'\bunplugged\b', 'live'),
    # Commentary
    (r'\bcommentary\b', 'commentary'),
    (r'\bcommented\b', 'commentary'),
    (r'\btrack.?by.?track\b', 'commentary'),
    # Interview
    (r'\binterview\b', 'interview'),
    (r'\binterlude\b', 'interview'),
    (r'\bskit\b', 'interview'),
    # Spoken word
    (r'\bspoken\s*word\b', 'spoken_word'),
    (r'\bnarrat(?:ion|ed)\b', 'spoken_word'),
    (r'\bintroduction\b', 'spoken_word'),
]

# Compiled once at import time so a library-wide scan does not go back through
# the ``re`` cache for every title. ``_CONTENT_PATTERNS`` itself is left as
# plain strings so any other reader of that list keeps working.
_COMPILED_CONTENT_PATTERNS = tuple(
    (re.compile(pattern), content_type)
    for pattern, content_type in _CONTENT_PATTERNS
)

# Display labels for the known content types.
_TYPE_LABELS = {
    'live': 'Live',
    'commentary': 'Commentary',
    'interview': 'Interview/Skit',
    'spoken_word': 'Spoken Word',
}


def _detect_content_type(title, album_title='', enabled_types=None):
    """Return the content type whose pattern matches the title/album text.

    Args:
        title: Track title; ``None`` is treated as an empty string.
        album_title: Optional album title searched together with ``title``.
        enabled_types: Optional collection of content-type labels to consider.
            When given, patterns of any other type are skipped, so a disabled
            type can no longer hide a later match of an enabled type (e.g.
            "Live Interview" with only ``'interview'`` enabled). ``None``
            (the default) considers every pattern, as before.

    Returns:
        The label of the first matching pattern in ``_CONTENT_PATTERNS``
        order, or ``None`` when nothing matches.
    """
    combined = f"{title or ''} {album_title or ''}".lower()
    for pattern, content_type in _COMPILED_CONTENT_PATTERNS:
        if enabled_types is not None and content_type not in enabled_types:
            continue
        if pattern.search(combined):
            return content_type
    return None


def _format_type(content_type):
    """Return the display label for ``content_type``.

    Unknown types fall back to ``str.title()`` of the raw label.
    """
    return _TYPE_LABELS.get(content_type, content_type.title())
def scan(self, context: JobContext) -> JobResult:
    """Scan library tracks and create a finding for each flagged track/album.

    Reads the job settings, loads every track that has a non-empty title and
    file path, and matches track titles (and optionally album titles)
    against ``_CONTENT_PATTERNS``. Nothing is modified here; findings are
    only created through ``context.create_finding``.

    Only patterns of *enabled* content types are considered while matching.
    Previously the first matching pattern decided the type and was filtered
    afterwards, so a disabled type could hide an enabled one: a "Skit" track
    on a "Live at ..." album was dropped when interviews were disabled even
    though live detection was on.

    Args:
        context: Job context supplying the database handle, settings,
            progress callbacks, the stop check and ``create_finding``.

    Returns:
        JobResult with ``scanned``, ``findings_created`` and ``errors``
        updated. Returned early (partially filled) when the stop check
        fires or the track query fails.
    """
    result = JobResult()

    settings = self._get_settings(context)
    type_flags = (
        ('flag_live', 'live'),
        ('flag_commentary', 'commentary'),
        ('flag_interviews', 'interview'),
        ('flag_spoken_word', 'spoken_word'),
    )
    enabled_types = {
        content_type for key, content_type in type_flags
        if settings.get(key, True)
    }
    if not enabled_types:
        return result

    scan_album_titles = settings.get('scan_album_titles', True)
    scope = settings.get('scope', 'tracks')

    def _match_enabled(text):
        """Return the first enabled content type matching ``text``, else None."""
        lowered = (text or '').lower()
        for pattern, content_type in _CONTENT_PATTERNS:
            if content_type in enabled_types and re.search(pattern, lowered):
                return content_type
        return None

    conn = None
    try:
        conn = context.db._get_connection()
        cursor = conn.cursor()
        cursor.execute("""
            SELECT t.id, t.title, ar.name, al.title, al.id, al.record_type,
                   t.file_path, t.bitrate, t.duration, t.track_number,
                   al.thumb_url, ar.thumb_url
            FROM tracks t
            LEFT JOIN artists ar ON ar.id = t.artist_id
            LEFT JOIN albums al ON al.id = t.album_id
            WHERE t.title IS NOT NULL AND t.title != ''
              AND t.file_path IS NOT NULL AND t.file_path != ''
        """)
        tracks = cursor.fetchall()
    except Exception as e:
        logger.error("Error fetching tracks: %s", e, exc_info=True)
        result.errors += 1
        return result
    finally:
        if conn:
            conn.close()

    if not tracks:
        return result

    total = len(tracks)
    if context.update_progress:
        context.update_progress(0, total)
    if context.report_progress:
        context.report_progress(phase=f'Scanning {total} tracks...', total=total)

    # Albums already reported in album scope, so each is flagged only once.
    flagged_album_ids = set()

    for idx, row in enumerate(tracks):
        if context.check_stop():
            return result

        result.scanned += 1

        # Throttle progress reporting to every 200th row.
        if idx % 200 == 0:
            if context.report_progress:
                context.report_progress(
                    scanned=idx, total=total,
                    phase=f'Scanning {idx} / {total}',
                    log_line=f'Checking: {row[1]}',
                    log_type='info'
                )
            if context.update_progress:
                context.update_progress(idx, total)

        (track_id, title, artist_name, album_title, album_id,
         album_type, file_path, bitrate, duration, track_number,
         album_thumb, artist_thumb) = row

        # The track title takes precedence; the album title is consulted
        # only when the track title yields no enabled match.
        content_type = _match_enabled(title)
        album_matched = False
        if not content_type and scan_album_titles and album_title:
            content_type = _match_enabled(album_title)
            album_matched = content_type is not None

        if not content_type:
            continue

        # Album scope: flag once per album, not per track
        is_album_finding = scope == 'albums' and album_matched
        if is_album_finding and album_id:
            if album_id in flagged_album_ids:
                continue
            flagged_album_ids.add(album_id)

        type_label = _format_type(content_type)
        match_source = f'album "{album_title}"' if album_matched else f'track "{title}"'

        if context.create_finding:
            try:
                context.create_finding(
                    job_id=self.job_id,
                    finding_type='unwanted_content',
                    severity='info',
                    entity_type='album' if is_album_finding else 'track',
                    entity_id=str(album_id if is_album_finding else track_id),
                    file_path=file_path,
                    title=f'{type_label}: {title} by {artist_name or "Unknown"}',
                    description=(
                        f'{type_label} content detected in {match_source}. '
                        f'Album: "{album_title or "Unknown"}" ({album_type or "unknown"} type).'
                    ),
                    details={
                        'track': {
                            'id': track_id,
                            'title': title,
                            'artist': artist_name or '',
                            'album': album_title or '',
                            'album_id': album_id,
                            'album_type': album_type or '',
                            'file_path': file_path,
                            'bitrate': bitrate,
                            'duration': duration,
                            'track_number': track_number,
                        },
                        'content_type': content_type,
                        'type_label': type_label,
                        'album_matched': album_matched,
                        'album_thumb_url': album_thumb or None,
                        'artist_thumb_url': artist_thumb or None,
                    }
                )
                result.findings_created += 1
            except Exception as e:
                # One failed finding must not abort the whole scan.
                logger.debug("Error creating finding: %s", e)
                result.errors += 1

    if context.update_progress:
        context.update_progress(total, total)

    logger.info("Live/Commentary cleaner: scanned %d tracks, found %d",
                result.scanned, result.findings_created)
    return result
def _fix_unwanted_content(self, entity_type, entity_id, file_path, details):
    """Remove one unwanted track (live, commentary, interview, spoken word).

    Deletes the track row, deletes its album row when that was the album's
    last track, then removes the audio file and up to three now-empty parent
    directories (never the transfer folder itself).

    Args:
        entity_type: ``'track'`` or ``'album'``, as stored on the finding.
        entity_id: Track id for track findings, album id for album findings.
        file_path: File path stored on the finding; used when
            ``details['track']['file_path']`` is missing.
        details: Finding details; ``details['track']`` supplies ``id``,
            ``album_id`` and ``file_path``, ``details['type_label']`` the
            label used in the result message.

    Returns:
        A dict with ``success`` plus either ``error`` or
        ``action``/``message``.
    """
    details = details or {}
    track_info = details.get('track') or {}
    track_id = track_info.get('id')
    # For album-scope findings entity_id is an ALBUM id; using it as a track
    # id could delete an unrelated track, so only fall back for track findings.
    if not track_id and entity_type != 'album':
        track_id = entity_id
    track_path = track_info.get('file_path') or file_path
    type_label = details.get('type_label', 'Unwanted')

    if not track_id:
        return {'success': False, 'error': 'No track ID to remove'}

    # Remove from DB. Track delete and empty-album cleanup share one commit so
    # a failure in the second step cannot leave the first one half-applied.
    album_id = track_info.get('album_id')
    album_removed = False
    conn = None
    try:
        conn = self.db._get_connection()
        cursor = conn.cursor()
        cursor.execute("DELETE FROM tracks WHERE id = ?", (track_id,))
        removed = cursor.rowcount

        # Check if album is now empty — clean it up too
        if removed and album_id:
            cursor.execute("SELECT COUNT(*) FROM tracks WHERE album_id = ?", (album_id,))
            if cursor.fetchone()[0] == 0:
                cursor.execute("DELETE FROM albums WHERE id = ?", (album_id,))
                album_removed = True
        conn.commit()
    except Exception as e:
        if conn:
            try:
                conn.rollback()
            except Exception:
                pass  # The connection is closed below regardless.
        return {'success': False, 'error': f'DB error: {e}'}
    finally:
        if conn:
            conn.close()

    if album_removed:
        logger.info("Cleaned up empty album (id=%s) after removing last track", album_id)

    if removed == 0:
        return {'success': True, 'action': 'already_removed', 'message': 'Track was already removed'}

    # Delete file from disk
    file_deleted = False
    if track_path:
        download_folder = None
        if self._config_manager:
            download_folder = self._config_manager.get('soulseek.download_path', '')
        try:
            resolved = _resolve_file_path(track_path, self.transfer_folder, download_folder)
            if resolved and os.path.exists(resolved):
                os.remove(resolved)
                file_deleted = True
                # Clean up empty parent directories. Skipped when the transfer
                # folder is unknown, because there would be no boundary to
                # stop at.
                if self.transfer_folder:
                    transfer_norm = os.path.normpath(self.transfer_folder)
                    parent = os.path.dirname(resolved)
                    for _ in range(3):
                        if not (parent and os.path.isdir(parent)
                                and os.path.normpath(parent) != transfer_norm
                                and not os.listdir(parent)):
                            break
                        os.rmdir(parent)
                        parent = os.path.dirname(parent)
        except OSError as e:
            # Best effort — DB entry already removed; log instead of hiding it.
            logger.warning("Could not fully remove %s from disk: %s", track_path, e)

    msg = f'{type_label} track removed from library'
    if file_deleted:
        msg += ' (file deleted)'
    return {'success': True, 'action': 'removed_content', 'message': msg}
file_path, details): """Remove the mismatched MusicBrainz recording ID from the audio file.""" if not file_path: