Add Live/Commentary Cleaner library maintenance job

New repair job that scans track and album titles for live performances,
commentary, interviews, skits, and spoken word content. Creates findings
for user review — no auto-fix.

Configurable per content type (live, commentary, interviews, spoken word),
with optional album title scanning and tracks/albums scope toggle.
Fix action removes track from DB + deletes file, cleans up empty albums
and directories. Follows existing repair job pattern exactly.
pull/253/head
Broque Thomas 2 weeks ago
parent 32cc7cbd5e
commit c3a3510c75

@ -41,6 +41,7 @@ _JOB_MODULES = [
'core.repair_jobs.single_album_dedup',
'core.repair_jobs.lossy_converter',
'core.repair_jobs.album_tag_consistency',
'core.repair_jobs.live_commentary_cleaner',
]

@ -0,0 +1,236 @@
"""Live/Commentary Cleaner Job — finds live, commentary, and interview content in the library."""
import re
from collections import defaultdict
from core.repair_jobs import register_job
from core.repair_jobs.base import JobContext, JobResult, RepairJob
from utils.logging_config import get_logger
logger = get_logger("repair_job.live_commentary_cleaner")
# Regex patterns that indicate unwanted content types.
# Each tuple: (regex pattern, content_type_label).
# _detect_content_type searches them in list order against lower-cased text and
# returns the label of the first hit, so a pattern that can only match where an
# earlier pattern with the same label already matches would never be reached.
_CONTENT_PATTERNS = [
    # Live — r'\blive\b' already covers "live at", "live from" and "live in"
    (r'\blive\b', 'live'),
    (r'\bin concert\b', 'live'),
    (r'\bunplugged\b', 'live'),
    # Commentary
    (r'\bcommentary\b', 'commentary'),
    (r'\bcommented\b', 'commentary'),
    (r'\btrack.?by.?track\b', 'commentary'),
    # Interview (skits and interludes are reported under the same label)
    (r'\binterview\b', 'interview'),
    (r'\binterlude\b', 'interview'),
    (r'\bskit\b', 'interview'),
    # Spoken word
    (r'\bspoken\s*word\b', 'spoken_word'),
    (r'\bnarrat(?:ion|ed)\b', 'spoken_word'),
    (r'\bintroduction\b', 'spoken_word'),
]
def _detect_content_type(title, album_title=''):
    """Return the content type of the first matching keyword pattern, else None.

    The track title and album title are joined with a space and lower-cased,
    then tested against ``_CONTENT_PATTERNS`` in list order; only the first
    hit is reported.
    """
    haystack = '{} {}'.format(title, album_title).lower()
    hits = (
        label
        for regex, label in _CONTENT_PATTERNS
        if re.search(regex, haystack)
    )
    return next(hits, None)
def _format_type(content_type):
    """Return the human-readable label for a content type.

    Known types map to fixed labels; any other value falls back to its
    title-cased form.
    """
    # Computed up front, as the original did, so a value without .title()
    # still fails in the same way.
    fallback = content_type.title()
    labels = {
        'live': 'Live',
        'commentary': 'Commentary',
        'interview': 'Interview/Skit',
        'spoken_word': 'Spoken Word',
    }
    if content_type in labels:
        return labels[content_type]
    return fallback
@register_job
class LiveCommentaryCleanerJob(RepairJob):
    """Repair job that flags live, commentary, interview and spoken-word content.

    Detection is keyword-based: ``_detect_content_type`` is run on each track
    title and, optionally, on its album title. The job only creates findings
    for review (``auto_fix`` is False); ``scan`` itself never modifies the
    library.
    """

    job_id = 'live_commentary_cleaner'
    display_name = 'Live/Commentary Cleaner'
    description = 'Finds live performances, commentary, interviews, and spoken word content'
    help_text = (
        'Scans your library for tracks and albums that contain live performances, '
        'commentary tracks, interviews, skits, or spoken word content based on '
        'title keywords.\n\n'
        'You can configure which content types to flag using the settings below. '
        'Each finding shows the track, its content type, and whether the track '
        'or album title matched.\n\n'
        'Fix action: removes the track from the database and deletes the file. '
        'If all tracks in an album are removed, the empty album is also cleaned up.\n\n'
        'Settings:\n'
        '- Flag Live: Flag live performances and concert recordings\n'
        '- Flag Commentary: Flag commentary and track-by-track content\n'
        '- Flag Interviews: Flag interviews, skits, and interludes\n'
        '- Flag Spoken Word: Flag spoken word, narration, and introductions\n'
        '- Scan Album Titles: Also check album titles (catches "Live at Wembley" albums)\n'
        '- Scope: "tracks" flags individual tracks, "albums" flags entire albums with matching titles'
    )
    icon = 'repair-icon-filter'
    default_enabled = False
    default_interval_hours = 168
    default_settings = {
        'flag_live': True,
        'flag_commentary': True,
        'flag_interviews': True,
        'flag_spoken_word': True,
        'scan_album_titles': True,
        'scope': 'tracks',  # 'tracks' or 'albums'
    }
    auto_fix = False

    @staticmethod
    def _enabled_types(settings):
        """Return the set of content types whose ``flag_*`` setting is on.

        A missing setting counts as enabled.
        """
        flag_to_type = (
            ('flag_live', 'live'),
            ('flag_commentary', 'commentary'),
            ('flag_interviews', 'interview'),
            ('flag_spoken_word', 'spoken_word'),
        )
        return {ctype for flag, ctype in flag_to_type if settings.get(flag, True)}

    @staticmethod
    def _classify(title, album_title, enabled_types, scan_album_titles):
        """Return ``(content_type, album_matched)`` for one track.

        The track title takes precedence. The album title is consulted
        whenever the track title yields no *enabled* type, so a disabled match
        on the track (an "Interlude" with interviews switched off) does not
        hide an enabled match on the album ("Live at Wembley"). Returns
        ``(None, False)`` when nothing enabled matches.

        ``_detect_content_type`` reports only the first matching pattern per
        title, so a single title that matches both a disabled and an enabled
        type can still be classified by the disabled one.
        """
        track_type = _detect_content_type(title, '')
        if track_type in enabled_types:
            return track_type, False
        if scan_album_titles and album_title:
            album_type = _detect_content_type('', album_title)
            if album_type in enabled_types:
                return album_type, True
        return None, False

    def scan(self, context: JobContext) -> JobResult:
        """Scan all tracks that have a title and file path; create findings for matches.

        Returns a ``JobResult`` whose ``scanned``, ``findings_created`` and
        ``errors`` counters are updated as the scan proceeds. Returns early
        (with whatever was counted so far) when no content type is enabled,
        the track query fails, no rows come back, or ``context.check_stop()``
        reports a stop request.
        """
        result = JobResult()
        settings = self._get_settings(context)

        enabled_types = self._enabled_types(settings)
        if not enabled_types:
            return result

        scan_album_titles = settings.get('scan_album_titles', True)
        album_scope = settings.get('scope', 'tracks') == 'albums'

        conn = None
        try:
            conn = context.db._get_connection()
            cursor = conn.cursor()
            cursor.execute("""
                SELECT t.id, t.title, ar.name, al.title, al.id, al.record_type,
                       t.file_path, t.bitrate, t.duration, t.track_number,
                       al.thumb_url, ar.thumb_url
                FROM tracks t
                LEFT JOIN artists ar ON ar.id = t.artist_id
                LEFT JOIN albums al ON al.id = t.album_id
                WHERE t.title IS NOT NULL AND t.title != ''
                  AND t.file_path IS NOT NULL AND t.file_path != ''
            """)
            tracks = cursor.fetchall()
        except Exception as e:
            logger.error("Error fetching tracks: %s", e, exc_info=True)
            result.errors += 1
            return result
        finally:
            if conn:
                conn.close()

        if not tracks:
            return result

        total = len(tracks)
        if context.update_progress:
            context.update_progress(0, total)
        if context.report_progress:
            context.report_progress(phase=f'Scanning {total} tracks...', total=total)

        # Track which albums we've already flagged (for album scope)
        flagged_album_ids = set()

        for idx, row in enumerate(tracks):
            if context.check_stop():
                return result
            result.scanned += 1

            # Report progress every 200 rows rather than on every track.
            if idx % 200 == 0:
                if context.report_progress:
                    context.report_progress(
                        scanned=idx, total=total,
                        phase=f'Scanning {idx} / {total}',
                        log_line=f'Checking: {row[1]}',
                        log_type='info'
                    )
                if context.update_progress:
                    context.update_progress(idx, total)

            (track_id, title, artist_name, album_title, album_id,
             album_type, file_path, bitrate, duration, track_number,
             album_thumb, artist_thumb) = row

            content_type, album_matched = self._classify(
                title, album_title, enabled_types, scan_album_titles)
            if not content_type:
                continue

            # Decide once whether this finding represents the album, so the
            # dedup check and the entity fields below can never disagree
            # (and entity_id is never the string 'None').
            as_album = album_scope and album_matched and album_id is not None
            if as_album:
                # Album scope: flag once per album, not per track.
                # NOTE(review): details['track'] then describes only the first
                # track seen for that album.
                if album_id in flagged_album_ids:
                    continue
                flagged_album_ids.add(album_id)

            type_label = _format_type(content_type)
            match_source = f'album "{album_title}"' if album_matched else f'track "{title}"'

            if context.create_finding:
                try:
                    context.create_finding(
                        job_id=self.job_id,
                        finding_type='unwanted_content',
                        severity='info',
                        entity_type='album' if as_album else 'track',
                        entity_id=str(album_id if as_album else track_id),
                        file_path=file_path,
                        title=f'{type_label}: {title} by {artist_name or "Unknown"}',
                        description=(
                            f'{type_label} content detected in {match_source}. '
                            f'Album: "{album_title or "Unknown"}" ({album_type or "unknown"} type).'
                        ),
                        details={
                            'track': {
                                'id': track_id,
                                'title': title,
                                'artist': artist_name or '',
                                'album': album_title or '',
                                'album_id': album_id,
                                'album_type': album_type or '',
                                'file_path': file_path,
                                'bitrate': bitrate,
                                'duration': duration,
                                'track_number': track_number,
                            },
                            'content_type': content_type,
                            'type_label': type_label,
                            'album_matched': album_matched,
                            'album_thumb_url': album_thumb or None,
                            'artist_thumb_url': artist_thumb or None,
                        }
                    )
                    result.findings_created += 1
                except Exception as e:
                    # Counted as an error, so log it where it will be seen.
                    logger.warning("Error creating finding: %s", e)
                    result.errors += 1

        if context.update_progress:
            context.update_progress(total, total)

        logger.info("Live/Commentary cleaner: scanned %d tracks, found %d",
                    result.scanned, result.findings_created)
        return result

@ -815,6 +815,7 @@ class RepairWorker:
'incomplete_album': self._fix_incomplete_album,
'path_mismatch': self._fix_path_mismatch,
'missing_lossy_copy': self._fix_missing_lossy_copy,
'unwanted_content': self._fix_unwanted_content,
}
handler = handlers.get(finding_type)
if not handler:
@ -1278,6 +1279,73 @@ class RepairWorker:
msg += ' (file deleted)'
return {'success': True, 'action': 'removed_single', 'message': msg}
def _fix_unwanted_content(self, entity_type, entity_id, file_path, details):
    """Remove one flagged track (live, commentary, interview, spoken word) from the library.

    Deletes the track row, drops its album row when no tracks remain, then
    best-effort deletes the audio file and up to three parent directories
    that are left empty (never the transfer folder itself).

    Args:
        entity_type: Entity kind stored on the finding ('track' or 'album').
        entity_id: ID of the finding's entity. Used as the track ID only when
            ``details['track']['id']`` is missing and the entity is not an
            album; an album ID must never be interpreted as a track ID.
        file_path: Fallback path when ``details['track']['file_path']`` is empty.
        details: Finding details; the 'track' and 'type_label' keys are read.

    Returns:
        dict with 'success', plus 'action' and 'message' on success or
        'error' on failure.
    """
    details = details or {}
    track_info = details.get('track') or {}
    track_id = track_info.get('id')
    if not track_id and entity_type != 'album':
        track_id = entity_id
    track_path = track_info.get('file_path') or file_path
    type_label = details.get('type_label', 'Unwanted')

    if not track_id:
        return {'success': False, 'error': 'No track ID to remove'}

    # Remove from DB. Both deletes are committed together so a failure while
    # cleaning up the album cannot leave the track half-removed.
    # NOTE(review): relies on closing the connection discarding uncommitted
    # changes (sqlite3 behaviour) — confirm for the connection type in use.
    conn = None
    album_id = track_info.get('album_id')
    album_emptied = False
    try:
        conn = self.db._get_connection()
        cursor = conn.cursor()
        cursor.execute("DELETE FROM tracks WHERE id = ?", (track_id,))
        removed = cursor.rowcount

        # Check if album is now empty — clean it up too
        if removed and album_id:
            cursor.execute("SELECT COUNT(*) FROM tracks WHERE album_id = ?", (album_id,))
            if cursor.fetchone()[0] == 0:
                cursor.execute("DELETE FROM albums WHERE id = ?", (album_id,))
                album_emptied = True
        conn.commit()
        if album_emptied:
            logger.info("Cleaned up empty album (id=%s) after removing last track", album_id)
    except Exception as e:
        return {'success': False, 'error': f'DB error: {e}'}
    finally:
        if conn:
            conn.close()

    if removed == 0:
        return {'success': True, 'action': 'already_removed', 'message': 'Track was already removed'}

    # Delete file from disk
    file_deleted = False
    if track_path:
        download_folder = None
        if self._config_manager:
            download_folder = self._config_manager.get('soulseek.download_path', '')
        try:
            resolved = _resolve_file_path(track_path, self.transfer_folder, download_folder)
            if resolved and os.path.exists(resolved):
                os.remove(resolved)
                file_deleted = True

                # Clean up empty parent directories, at most three levels up
                # and stopping at the transfer folder.
                transfer_norm = os.path.normpath(self.transfer_folder)
                parent = os.path.dirname(resolved)
                for _ in range(3):
                    if (parent and os.path.isdir(parent)
                            and os.path.normpath(parent) != transfer_norm
                            and not os.listdir(parent)):
                        os.rmdir(parent)
                        parent = os.path.dirname(parent)
                    else:
                        break
        except OSError as e:
            # Best effort — DB entry already removed; record what was left behind.
            logger.warning("Could not fully remove %s from disk: %s", track_path, e)

    msg = f'{type_label} track removed from library'
    if file_deleted:
        msg += ' (file deleted)'
    return {'success': True, 'action': 'removed_content', 'message': msg}
def _fix_mbid_mismatch(self, entity_type, entity_id, file_path, details):
"""Remove the mismatched MusicBrainz recording ID from the audio file."""
if not file_path:

Loading…
Cancel
Save