You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
SoulSync/core/metadata/canonical_resolver.py

282 lines
12 KiB

"""Resolve (and persist) the canonical release for an album — Stage 2 of #765.
Stage 1 gave us the pure scorer (``core.metadata.canonical_version``). This
module turns it into an end-to-end resolver: gather the album's candidate
releases (one per metadata-source ID it has), score each against the on-disk
files, and return the best fit. Wiring (backfill job / enrichment hook) and the
DB store live alongside; the decision logic here is kept dependency-injected
(``fetch_tracklist`` is passed in) so it's fully unit-testable without live APIs
or real files.
Still NO consumer reads the result in Stage 2 — populating the columns is
behavior-neutral. Stages 3-4 wire the Reorganizer and Track Number Repair to
read it.
"""
from __future__ import annotations
from typing import Any, Callable, Dict, List, Optional
from core.metadata.canonical_version import (
score_release_against_files,
score_release_detail,
)
# Source-selection modes (a per-job setting). See resolve_canonical_for_album.
MODE_ACTIVE_PREFERRED = "active_preferred"  # default: use the active source if it fits, else best-fit
MODE_ACTIVE_ONLY = "active_only"  # only ever the active source
MODE_BEST_FIT = "best_fit"  # whichever source fits the files best
VALID_MODES = (MODE_ACTIVE_PREFERRED, MODE_ACTIVE_ONLY, MODE_BEST_FIT)


def resolve_canonical_for_album(
    *,
    album_source_ids: Dict[str, str],
    file_tracks: List[Dict[str, Any]],
    fetch_tracklist: Callable[[str, str], Optional[List[Dict[str, Any]]]],
    source_priority: List[str],
    min_score: float = 0.5,
    mode: str = MODE_ACTIVE_PREFERRED,
    primary_source: Optional[str] = None,
) -> Optional[Dict[str, Any]]:
    """Pick the canonical release for one album, honoring the source-selection mode.

    ``album_source_ids``: ``{source: album_id}`` the album is linked to.
    ``file_tracks``: on-disk track metadata (``{duration_ms, title}``).
    ``fetch_tracklist(source, album_id)``: returns that release's tracklist (or
    None/[] on miss); injected so callers supply ``get_album_tracks_for_source``
    while tests supply a fake. An exception raised by it is treated as a miss.
    Each source is fetched at most once per call, whether it hits or misses.
    ``source_priority``: source order; ties break toward the earlier source.
    ``min_score``: floor a release's score must reach to be chosen.
    ``mode``: one of the ``MODE_*`` constants (see below). Any value other than
    the two active modes takes the best-fit path.
    ``primary_source``: the user's active metadata source (defaults to the first
    of ``source_priority``).

    Modes:
    - ``active_preferred`` (default): use the active source's release when the
      album has an ID for it AND it clears ``min_score``; otherwise fall back
      to the best-fit among the sources in ``source_priority``. So it normally
      respects the user's configured source but self-heals when that link is
      clearly wrong.
    - ``active_only``: only ever the active source (pinned if it clears the
      floor; never considers other sources).
    - ``best_fit``: whichever source's release best matches the files.

    Returns an enriched dict for the chosen release — ``source``, ``album_id``,
    ``score``, the per-signal breakdown (``count_fit``/``duration_fit``/
    ``title_fit``), ``file_track_count`` vs ``release_track_count``,
    ``release_track_titles`` (capped at 60), and a ``candidates`` list of
    everything it scored (so a finding can show WHY the pick won and what it
    beat). ``None`` when there are no files, no resolvable candidates, or
    nothing clears ``min_score``.
    """
    if not file_tracks:
        return None

    primary = primary_source or (source_priority[0] if source_priority else None)
    scored: List[Dict[str, Any]] = []  # every source that produced a tracklist, in scoring order
    # source -> scored entry, or None for a miss. Misses are remembered too, so
    # a primary that failed is not fetched again by the fallback pass below.
    attempted: Dict[str, Optional[Dict[str, Any]]] = {}

    def _score(source: Optional[str]) -> Optional[Dict[str, Any]]:
        """Fetch + score one source once; return its entry, or None on a miss."""
        if not source:
            return None
        if source in attempted:
            return attempted[source]
        attempted[source] = None  # pessimistic: overwritten below on success
        album_id = album_source_ids.get(source)
        if not album_id:
            return None
        try:
            tracks = fetch_tracklist(source, str(album_id))
        except Exception:
            # Best-effort: a failing source is simply not a candidate.
            tracks = None
        if not tracks:
            return None
        entry = {
            'source': source,
            'album_id': str(album_id),
            'track_count': len(tracks),
            'score': round(score_release_against_files(file_tracks, tracks), 4),
            '_tracks': tracks,  # kept for the detail pass; stripped from the output
        }
        scored.append(entry)
        attempted[source] = entry
        return entry

    winner: Optional[Dict[str, Any]] = None

    # Active-source modes: try the primary first.
    if mode in (MODE_ACTIVE_ONLY, MODE_ACTIVE_PREFERRED):
        p = _score(primary)
        if p and p['score'] >= min_score:
            winner = p
        elif mode == MODE_ACTIVE_ONLY:
            return None  # never consider other sources

    # best_fit, or active_preferred fallback: score the rest and pick the best.
    if winner is None:
        for source in source_priority:
            _score(source)
        best = None
        for e in scored:  # scoring order -> strictly-greater = priority tiebreak
            if best is None or e['score'] > best['score'] + 1e-9:
                best = e
        if best and best['score'] >= min_score:
            winner = best

    if winner is None:
        return None

    detail = score_release_detail(file_tracks, winner['_tracks'])
    # Pinned-release track titles — already fetched, so free. Capped so a giant
    # box set can't bloat the finding's details_json.
    release_titles = [
        (t.get('title') or t.get('name') or '') for t in winner['_tracks']
    ][:60]
    return {
        'source': winner['source'],
        'album_id': winner['album_id'],
        'score': winner['score'],
        'file_track_count': detail['file_track_count'],
        'release_track_count': detail['release_track_count'],
        'count_fit': detail['count_fit'],
        'duration_fit': detail['duration_fit'],
        'title_fit': detail['title_fit'],
        'release_track_titles': release_titles,
        'candidates': [
            {'source': e['source'], 'album_id': e['album_id'],
             'track_count': e['track_count'], 'score': e['score']}
            for e in scored
        ],
    }
def _item_get(item: Any, key: str, default: Any = None) -> Any:
    """Fetch ``key`` from a track item, whichever shape it has.

    Dicts are read with ``.get``; anything else is read as an attribute.
    ``default`` is returned when the key/attribute is absent.
    """
    if isinstance(item, dict):
        return item.get(key, default)
    return getattr(item, key, default)
def default_fetch_tracklist(source: str, album_id: str) -> Optional[List[Dict[str, Any]]]:
    """Production ``fetch_tracklist``: load a release's tracklist via
    ``get_album_tracks_for_source`` and normalise each track to
    ``{title, track_number, duration_ms}``.

    Accepted payload shapes: a plain list of tracks, a dict holding the tracks
    under ``items`` or ``tracks``, or a dict whose ``tracks`` value is itself a
    dict with an ``items`` list. Any other payload yields no tracks.

    Duration is best-effort: ``duration_ms`` is used when present, otherwise a
    numeric, non-zero ``duration`` is multiplied by 1000 (the original author's
    note says some sources report seconds there). When neither is available the
    value is None.

    Returns None when the import or lookup raises, or when no tracks come back.
    """
    try:
        from core.metadata_service import get_album_tracks_for_source
        payload = get_album_tracks_for_source(source, album_id)
    except Exception:
        return None

    # Unwrap the payload down to an iterable of track items.
    if isinstance(payload, list):
        entries = payload
    elif isinstance(payload, dict):
        entries = payload.get('items') or payload.get('tracks') or []
    else:
        entries = []
    if isinstance(entries, dict):  # {'tracks': {'items': [...]}}
        entries = entries.get('items') or []

    normalised: List[Dict[str, Any]] = []
    for entry in entries:
        duration_ms = _item_get(entry, 'duration_ms')
        if duration_ms is None:
            seconds = _item_get(entry, 'duration')  # some sources give seconds
            if isinstance(seconds, (int, float)) and seconds:
                duration_ms = int(seconds * 1000)
        normalised.append({
            'title': _item_get(entry, 'name') or _item_get(entry, 'title') or '',
            'track_number': _item_get(entry, 'track_number'),
            'duration_ms': duration_ms,
        })

    if not normalised:
        return None
    return normalised
def _lookup_artist_thumb(db, artist_id) -> Optional[str]:
    """Best-effort lookup of an artist's thumb URL by id.

    ``db`` must expose ``_get_connection()``; the connection obtained here is
    always closed before returning. The ``artists`` table is inspected first
    because not every schema has a ``thumb_url`` column.

    Returns the stored URL, or None when ``artist_id`` is falsy, the column or
    row is missing, the stored value is empty, or anything raises.
    """
    if not artist_id:
        return None

    connection = None
    try:
        connection = db._get_connection()
        cur = connection.cursor()

        # Column name is the second field of each table_info row.
        cur.execute("PRAGMA table_info(artists)")
        column_names = [column[1] for column in cur.fetchall()]
        if 'thumb_url' not in column_names:
            return None

        cur.execute("SELECT thumb_url FROM artists WHERE id = ?", (str(artist_id),))
        match = cur.fetchone()
        if not match:
            return None
        return match[0] or None
    except Exception:
        return None
    finally:
        if connection:
            connection.close()
def resolve_and_store_canonical_for_album(
    db,
    album_id,
    *,
    fetch_tracklist: Optional[Callable[[str, str], Any]] = None,
    source_priority: Optional[List[str]] = None,
    min_score: float = 0.5,
    store: bool = True,
    mode: str = MODE_ACTIVE_PREFERRED,
) -> Optional[Dict[str, Any]]:
    """Resolve one library album's canonical release from the DB and optionally
    persist it.

    Loads the album row and its tracks with the Reorganizer's loader
    (``load_album_and_tracks`` + ``_extract_source_ids``), so the canonical is
    picked over exactly the source IDs the reorganizer sees. Scoring uses the DB
    track rows' ``duration`` (stored in ms) and ``title`` — the library's view
    of the files — so no per-file disk reads are needed.

    ``fetch_tracklist``: tracklist loader; ``default_fetch_tracklist`` when None.
    ``source_priority``: explicit source order. When None, the primary source
    and priority are read from ``core.metadata_service``; if that raises, the
    album's own linked sources are used as the order.
    ``min_score`` / ``mode``: forwarded to ``resolve_canonical_for_album``.
    ``store``: when true, a resolved result is written with
    ``db.set_album_canonical``. ``store=False`` resolves without writing — used
    by the backfill job's dry run.

    Returns the resolver's result dict enriched with album/artist context
    (titles, year, track count, duration, linked sources, file track titles,
    thumbs where available), or None when the album has no row, no tracks, no
    linked source IDs, or nothing resolves.
    """
    from core.library_reorganize import _extract_source_ids, load_album_and_tracks

    album_row, track_rows = load_album_and_tracks(db, album_id)
    if not (album_row and track_rows):
        return None

    # Keep only the sources the album actually has an ID for.
    linked: Dict[str, Any] = {}
    for src, ident in _extract_source_ids(album_row).items():
        if ident:
            linked[src] = ident
    if not linked:
        return None

    library_tracks: List[Dict[str, Any]] = []
    for row in track_rows:
        library_tracks.append({
            'duration_ms': row.get('duration') or 0,
            'title': row.get('title') or '',
        })

    fetcher = default_fetch_tracklist if fetch_tracklist is None else fetch_tracklist

    active_source = None
    priority = source_priority
    if priority is None:
        try:
            from core.metadata_service import get_primary_source, get_source_priority
            active_source = get_primary_source()
            priority = get_source_priority(active_source)
        except Exception:
            priority = list(linked)

    result = resolve_canonical_for_album(
        album_source_ids=linked,
        file_tracks=library_tracks,
        fetch_tracklist=fetcher,
        source_priority=priority,
        min_score=min_score,
        mode=mode,
        primary_source=active_source,
    )
    if not result:
        return result

    # Album/artist/art context for richer findings, read off the row already
    # loaded (no extra query). Storage only uses source/id/score.
    result['album_title'] = album_row.get('title') or ''
    result['artist_name'] = album_row.get('artist_name') or ''

    year = album_row.get('year')
    if year:
        result['year'] = year

    result['db_track_count'] = album_row.get('track_count') or len(library_tracks)

    total_duration = album_row.get('duration')
    if total_duration:
        result['db_duration_ms'] = total_duration

    result['linked_sources'] = linked  # {source: album_id} the album points at now
    result['file_track_titles'] = [ft.get('title') or '' for ft in library_tracks[:60]]

    album_thumb = album_row.get('thumb_url')
    if album_thumb:
        result['album_thumb_url'] = album_thumb

    # Artist thumb goes through a guarded lookup rather than the shared album
    # loader, because some schemas have no artists.thumb_url column. It only
    # runs for resolved albums, so unresolved ones pay nothing.
    artist_thumb = _lookup_artist_thumb(db, album_row.get('artist_id'))
    if artist_thumb:
        result['artist_thumb_url'] = artist_thumb

    if store:
        db.set_album_canonical(album_id, result['source'], result['album_id'], result['score'])
    return result
# Names exported by ``from ... import *``. The MODE_* constants, VALID_MODES and
# the underscore-prefixed helpers are not listed here.
__all__ = [
"resolve_canonical_for_album",
"resolve_and_store_canonical_for_album",
"default_fetch_tracklist",
]