diff --git a/core/library/embedded_id_reconcile.py b/core/library/embedded_id_reconcile.py new file mode 100644 index 00000000..06f20711 --- /dev/null +++ b/core/library/embedded_id_reconcile.py @@ -0,0 +1,299 @@ +"""Reconcile provider IDs embedded in audio files into the library DB. + +Enrichment workers (Spotify / iTunes / MusicBrainz / Deezer / Tidal / +AudioDB / Genius / Last.fm) resolve each artist / album / track to a provider ID +via API calls, gating their work queues on ``{provider}_match_status IS +NULL``. But files that SoulSync (or MusicBrainz Picard) already tagged +carry those IDs in their metadata. Reading them back and gap-filling the +``{provider}_id`` + ``{provider}_match_status = 'matched'`` columns lets +the workers skip the API lookup entirely — large API savings on an +already-tagged library. + +Split into a PURE planning layer and a thin DB apply layer: + +- :func:`plan_reconcile` takes the tags read from ONE file (via + ``core.library.file_tags.read_embedded_tags``) plus the current IDs of + that file's track + its parent album + artist, and produces the list of + :class:`Fill` operations to perform. It is gap-fill only: a provider id + that already has a value is never planned for change; a DISAGREEING + embedded id is reported as a conflict instead. + +- :func:`apply_reconcile_plan` writes a plan, one guarded ``UPDATE`` per + id column: ``WHERE id = ? AND ({id_col} IS NULL OR {id_col} = '')``. + The guard makes the gap-fill ATOMIC — even if an enrichment worker + matched the same entity between the plan's read and this write, the + fill simply affects 0 rows instead of clobbering the worker's value. + Columns are introspected first so a schema version missing a provider's + columns is skipped, not errored. + +Scope note: the MusicBrainz *recording* (track) ID is intentionally not +reconciled — on ID3 it lives in a ``UFID`` frame the shared reader +doesn't surface and the Vorbis ``musicbrainz_trackid`` convention is +format-ambiguous. 
MB *album* and *artist* IDs (which drive most worker +API calls) ARE reconciled, as are the clean per-provider track/album/ +artist IDs of the other services. +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Any, Dict, List, Optional + +# Each entry: (embedded-tag key from read_embedded_tags, entity, id column, +# match-status column). The id columns mirror web_server._SERVICE_ID_COLUMNS; +# they're spelled out here so this module stays importable without the Flask +# app. Single-column providers (deezer/tidal/audiodb/genius) reuse one id +# column across entity types — that's fine, fills are keyed by (entity, col). +_RECONCILE_FIELDS = ( + ('spotify_track_id', 'track', 'spotify_track_id', 'spotify_match_status'), + ('spotify_album_id', 'album', 'spotify_album_id', 'spotify_match_status'), + ('spotify_artist_id', 'artist', 'spotify_artist_id', 'spotify_match_status'), + ('itunes_track_id', 'track', 'itunes_track_id', 'itunes_match_status'), + ('itunes_album_id', 'album', 'itunes_album_id', 'itunes_match_status'), + ('itunes_artist_id', 'artist', 'itunes_artist_id', 'itunes_match_status'), + ('musicbrainz_albumid', 'album', 'musicbrainz_release_id', 'musicbrainz_match_status'), + ('musicbrainz_artistid', 'artist', 'musicbrainz_id', 'musicbrainz_match_status'), + ('deezer_track_id', 'track', 'deezer_id', 'deezer_match_status'), + ('deezer_album_id', 'album', 'deezer_id', 'deezer_match_status'), + ('deezer_artist_id', 'artist', 'deezer_id', 'deezer_match_status'), + ('tidal_track_id', 'track', 'tidal_id', 'tidal_match_status'), + ('tidal_album_id', 'album', 'tidal_id', 'tidal_match_status'), + ('tidal_artist_id', 'artist', 'tidal_id', 'tidal_match_status'), + ('audiodb_track_id', 'track', 'audiodb_id', 'audiodb_match_status'), + ('audiodb_album_id', 'album', 'audiodb_id', 'audiodb_match_status'), + ('audiodb_artist_id', 'artist', 'audiodb_id', 'audiodb_match_status'), + ('genius_track_id', 'track', 'genius_id', 
'genius_match_status'), + # Last.fm embeds a single LASTFM_URL — sourced from get_track_info(), so it + # is the TRACK's url. Map to tracks.lastfm_url only (artist/album last.fm + # urls are different urls and aren't carried in the file). + ('lastfm_url', 'track', 'lastfm_url', 'lastfm_match_status'), +) + +_ENTITIES = ('track', 'album', 'artist') +_ENTITY_TABLE = {'track': 'tracks', 'album': 'albums', 'artist': 'artists'} + + +@dataclass(frozen=True) +class Fill: + """One provider-id column to gap-fill on one entity.""" + entity: str # 'track' | 'album' | 'artist' + id_column: str # e.g. 'spotify_artist_id' + status_column: str # e.g. 'spotify_match_status' + value: str # the embedded id to write + + +@dataclass +class ReconcilePlan: + """The outcome of planning one file against its current DB rows. + + ``fills`` are the gap-fill operations to apply (empty id columns only). + ``already_present`` counts embedded ids that matched a value already + stored (no-op). ``conflicts`` lists embedded ids that DISAGREE with a + stored value — never applied, surfaced for review. 
+ """ + + fills: List[Fill] = field(default_factory=list) + already_present: int = 0 + conflicts: List[Dict[str, str]] = field(default_factory=list) + + @property + def filled(self) -> int: + return len(self.fills) + + @property + def has_updates(self) -> bool: + return bool(self.fills) + + def fills_for(self, entity: str) -> List[Fill]: + return [f for f in self.fills if f.entity == entity] + + +@dataclass +class ReconcileApplied: + """Counts from actually writing a plan (based on real ``rowcount``).""" + rows_updated: int = 0 # distinct entity rows touched + ids_filled: int = 0 # id columns that actually landed (guard passed) + + +def _clean(value: Any) -> Optional[str]: + """Normalise a tag/column value to a non-empty stripped string or None.""" + if value is None: + return None + s = str(value).strip() + return s or None + + +def plan_reconcile( + embedded_tags: Optional[Dict[str, Any]], + current_ids: Optional[Dict[str, Dict[str, Any]]], +) -> ReconcilePlan: + """Plan which provider-ID columns to gap-fill from one file's tags. + + Args: + embedded_tags: the ``tags`` dict from ``read_embedded_tags`` (flat + ``friendly_key -> value``). ``None`` / empty yields an empty plan. + current_ids: ``{'track': {...}, 'album': {...}, 'artist': {...}}`` + where each inner dict holds the entity's CURRENT column values + (at minimum the id columns this module touches). Missing + entities / keys are treated as empty (eligible to fill). + + Returns: + A :class:`ReconcilePlan`. Gap-fill only — an id column with any + existing value is never planned; a disagreeing embedded id is + recorded in ``conflicts``. 
+ """ + plan = ReconcilePlan() + tags = embedded_tags or {} + current = current_ids or {} + queued: Dict[tuple, str] = {} # (entity, id_col) already queued this pass + + for embedded_key, entity, id_col, status_col in _RECONCILE_FIELDS: + new_val = _clean(tags.get(embedded_key)) + if not new_val: + continue + + row = current.get(entity) or {} + existing = _clean(row.get(id_col)) + if existing is not None: + if existing != new_val: + plan.conflicts.append({ + 'entity': entity, 'column': id_col, + 'existing': existing, 'embedded': new_val, + }) + else: + plan.already_present += 1 + continue + + key = (entity, id_col) + if key in queued: + # A single-column provider already queued this id col this pass. + if queued[key] != new_val: + plan.conflicts.append({ + 'entity': entity, 'column': id_col, + 'existing': queued[key], 'embedded': new_val, + }) + continue + + queued[key] = new_val + plan.fills.append(Fill(entity, id_col, status_col, new_val)) + + return plan + + +@dataclass +class TrackReconcileResult: + """Outcome of reconciling one track row against its file's tags.""" + applied: 'ReconcileApplied' + conflicts: int = 0 + readable: bool = True # False when the file's tags couldn't be read + + +def reconcile_track_row( + cursor, + track_row: Dict[str, Any], + album_map: Dict[str, Dict[str, Any]], + artist_map: Dict[str, Dict[str, Any]], + embedded_tags: Optional[Dict[str, Any]], +) -> TrackReconcileResult: + """Reconcile one track row + its parent album/artist against one file. + + Pure orchestration over :func:`plan_reconcile` / :func:`apply_reconcile_plan`, + extracted so the per-track logic (id extraction, plan→apply chaining, + keeping the in-memory parent maps fresh for sibling tracks) is testable + without the Flask job. ``embedded_tags`` is the ``tags`` dict from + ``read_embedded_tags`` (``None`` => unreadable file). 
+ + ``album_map`` / ``artist_map`` map entity-id -> current column dict; this + function UPDATES them in place with any fills it applies so a later track + on the same album/artist sees the value and doesn't re-plan it. (DB safety + is the guarded UPDATE in apply, never these maps.) + """ + if not embedded_tags: + return TrackReconcileResult(ReconcileApplied(), 0, readable=False) + + album_id = str(track_row['album_id']) if track_row.get('album_id') is not None else None + artist_id = str(track_row['artist_id']) if track_row.get('artist_id') is not None else None + + plan = plan_reconcile(embedded_tags, { + 'track': track_row, + 'album': album_map.get(album_id, {}) if album_id else {}, + 'artist': artist_map.get(artist_id, {}) if artist_id else {}, + }) + applied = apply_reconcile_plan(cursor, { + 'track': track_row.get('id'), 'album': album_id, 'artist': artist_id, + }, plan) + + if album_id: + for f in plan.fills_for('album'): + album_map.setdefault(album_id, {})[f.id_column] = f.value + if artist_id: + for f in plan.fills_for('artist'): + artist_map.setdefault(artist_id, {})[f.id_column] = f.value + + return TrackReconcileResult(applied, len(plan.conflicts), readable=True) + + +def _existing_columns(cursor, table: str) -> set: + """Return the set of column names on ``table`` (migration-safe guard).""" + cursor.execute(f"PRAGMA table_info({table})") + return {r[1] for r in cursor.fetchall()} + + +def apply_reconcile_plan(cursor, entity_ids: Dict[str, Any], plan: ReconcilePlan) -> ReconcileApplied: + """Apply a :class:`ReconcilePlan` to the DB via ``cursor``. + + Each fill is a single guarded ``UPDATE``: + + UPDATE {table} SET {id}=?, {status}='matched', {attempted}=now + WHERE id=? AND ({id} IS NULL OR {id}='') + + The ``id IS NULL OR id=''`` guard makes the gap-fill atomic: if the + column became non-empty between the plan's read and now (an enrichment + worker matched it concurrently), the UPDATE affects 0 rows and the + worker's value is preserved. 
Only columns that exist on the table are + written (introspected + cached per call), so a schema missing a + provider's columns is silently skipped. + + Args: + cursor: an open DB cursor (caller owns the transaction/commit). + entity_ids: ``{'track': id, 'album': id, 'artist': id}``. An entity + with no id is skipped. + + Returns: + A :class:`ReconcileApplied` with counts derived from real rowcounts. + """ + result = ReconcileApplied() + touched: set = set() + col_cache: Dict[str, set] = {} + + for fill in plan.fills: + ent_id = entity_ids.get(fill.entity) + if ent_id is None or ent_id == '': + continue + table = _ENTITY_TABLE[fill.entity] + if table not in col_cache: + col_cache[table] = _existing_columns(cursor, table) + cols = col_cache[table] + if fill.id_column not in cols: + continue + + assignments = [f"{fill.id_column} = ?"] + values: List[Any] = [fill.value] + if fill.status_column in cols: + assignments.append(f"{fill.status_column} = ?") + values.append('matched') + attempted = fill.status_column.replace('_match_status', '_last_attempted') + if attempted in cols: + assignments.append(f"{attempted} = CURRENT_TIMESTAMP") + + cursor.execute( + f"UPDATE {table} SET {', '.join(assignments)} " + f"WHERE id = ? AND ({fill.id_column} IS NULL OR {fill.id_column} = '')", + values + [str(ent_id)], + ) + if cursor.rowcount: + result.ids_filled += 1 + touched.add((fill.entity, str(ent_id))) + + result.rows_updated = len(touched) + return result diff --git a/tests/library/test_embedded_id_reconcile.py b/tests/library/test_embedded_id_reconcile.py new file mode 100644 index 00000000..9c95b896 --- /dev/null +++ b/tests/library/test_embedded_id_reconcile.py @@ -0,0 +1,273 @@ +"""Tests for core/library/embedded_id_reconcile.py. + +The reconcile job reads provider IDs already embedded in a file's tags +(by SoulSync or MusicBrainz Picard) and gap-fills them into the library +DB so enrichment workers skip the API call. 
"""Tests for core/library/embedded_id_reconcile.py.

The reconcile job reads provider IDs already embedded in a file's tags
(by SoulSync or MusicBrainz Picard) and gap-fills them into the library
DB so enrichment workers skip the API call. These pin the guarantees that
make it safe to run across a whole library while workers run concurrently:

  1. gap-fill only — an existing id is NEVER overwritten,
  2. disagreements are reported as conflicts, not applied,
  3. the write is ATOMICALLY guarded — if a worker fills the column
     between plan and apply, the apply no-ops (no clobber).
"""

from __future__ import annotations

import sqlite3

from core.library.embedded_id_reconcile import (
    Fill,
    ReconcileApplied,
    ReconcilePlan,
    apply_reconcile_plan,
    plan_reconcile,
    reconcile_track_row,
)


# ---------------------------------------------------------------------------
# plan_reconcile — the pure planning layer
# ---------------------------------------------------------------------------

def test_empty_inputs_yield_empty_plan():
    """No tags and no current ids produce a plan with nothing in it."""
    empty = plan_reconcile(None, None)
    assert isinstance(empty, ReconcilePlan)
    assert empty.has_updates is False
    assert empty.filled == 0
    assert empty.conflicts == []


def test_fills_all_three_entities_from_one_file():
    """One file's tags can fill the track, album and artist at once."""
    embedded = {'spotify_track_id': 'TRK', 'spotify_album_id': 'ALB', 'spotify_artist_id': 'ART'}
    result = plan_reconcile(embedded, {'track': {}, 'album': {}, 'artist': {}})

    assert result.filled == 3
    planned = {(fill.entity, fill.id_column): fill.value for fill in result.fills}
    assert planned[('track', 'spotify_track_id')] == 'TRK'
    assert planned[('album', 'spotify_album_id')] == 'ALB'
    assert planned[('artist', 'spotify_artist_id')] == 'ART'
    # status column pairing is carried on each Fill
    first_track_fill = result.fills_for('track')[0]
    assert first_track_fill.status_column == 'spotify_match_status'


def test_never_overwrites_an_existing_id():
    """A stored id that disagrees with the file becomes a conflict, not a fill."""
    result = plan_reconcile({'spotify_artist_id': 'NEW'},
                            {'artist': {'spotify_artist_id': 'EXISTING'}})
    assert result.filled == 0
    assert result.fills_for('artist') == []
    assert len(result.conflicts) == 1
    conflict = result.conflicts[0]
    assert conflict['existing'] == 'EXISTING'
    assert conflict['embedded'] == 'NEW'
    assert conflict['entity'] == 'artist'


def test_matching_existing_id_is_noop_not_conflict():
    """A stored id equal to the embedded one is counted, never re-planned."""
    result = plan_reconcile({'spotify_artist_id': 'SAME'},
                            {'artist': {'spotify_artist_id': 'SAME'}})
    assert result.filled == 0
    assert result.conflicts == []
    assert result.already_present == 1


def test_blank_and_whitespace_values_ignored():
    """Whitespace-only, empty and None tag values plan nothing."""
    embedded = {'spotify_artist_id': ' ', 'spotify_album_id': '', 'itunes_track_id': None}
    result = plan_reconcile(embedded, {'track': {}, 'album': {}, 'artist': {}})
    assert result.has_updates is False


def test_whitespace_padded_embedded_id_is_trimmed_and_filled():
    """Surrounding whitespace is stripped from the value that gets planned."""
    result = plan_reconcile({'spotify_track_id': ' TRK '}, {'track': {}})
    assert result.fills_for('track')[0].value == 'TRK'


def test_single_column_provider_maps_per_entity():
    """Providers sharing one id column name still fill each entity separately."""
    # Deezer/Tidal/AudioDB reuse one id column across entity types; fills
    # must be keyed by entity so they don't collide.
    embedded = {'deezer_track_id': 'DT', 'deezer_album_id': 'DA', 'deezer_artist_id': 'DR'}
    result = plan_reconcile(embedded, {'track': {}, 'album': {}, 'artist': {}})
    per_entity = {fill.entity: fill.value for fill in result.fills}
    assert per_entity == {'track': 'DT', 'album': 'DA', 'artist': 'DR'}
    assert result.filled == 3


def test_mb_album_and_artist_filled_track_recording_skipped():
    """MusicBrainz album/artist ids are planned; the recording id is not."""
    embedded = {'musicbrainz_albumid': 'MBA', 'musicbrainz_artistid': 'MBR', 'musicbrainz_trackid': 'MBT'}
    result = plan_reconcile(embedded, {'track': {}, 'album': {}, 'artist': {}})
    planned = {(fill.entity, fill.id_column): fill.value for fill in result.fills}
    assert planned[('album', 'musicbrainz_release_id')] == 'MBA'
    assert planned[('artist', 'musicbrainz_id')] == 'MBR'
    assert result.fills_for('track') == []  # recording id not reconciled


def test_lastfm_url_maps_to_track_only():
    """The embedded Last.fm url fills the track row and nothing else."""
    # The file carries a single LASTFM_URL = the TRACK's last.fm url. It must
    # fill tracks.lastfm_url and NOT be smeared onto album/artist (whose
    # last.fm urls are different urls entirely).
    result = plan_reconcile({'lastfm_url': 'https://last.fm/music/A/_/Song'},
                            {'track': {}, 'album': {}, 'artist': {}})
    assert result.filled == 1
    track_fill = result.fills_for('track')[0]
    assert track_fill.id_column == 'lastfm_url'
    assert track_fill.status_column == 'lastfm_match_status'
    assert result.fills_for('album') == []
    assert result.fills_for('artist') == []


def test_partial_fill_when_one_entity_already_matched():
    """Only the entity whose column is still empty gets a fill."""
    embedded = {'spotify_artist_id': 'ART', 'spotify_album_id': 'ALB'}
    stored = {'artist': {'spotify_artist_id': 'ART'}, 'album': {}}
    result = plan_reconcile(embedded, stored)
    assert result.filled == 1
    assert result.fills_for('album')[0].value == 'ALB'
    assert result.fills_for('artist') == []
    assert result.already_present == 1


# ---------------------------------------------------------------------------
# apply_reconcile_plan — the DB layer (in-memory sqlite)
# ---------------------------------------------------------------------------

def _make_db():
    """Build an in-memory DB with one track, album and artist (Spotify columns only)."""
    conn = sqlite3.connect(':memory:')
    cur = conn.cursor()
    schema = (
        ('tracks', 'spotify_track_id'),
        ('albums', 'spotify_album_id'),
        ('artists', 'spotify_artist_id'),
    )
    for table, idcol in schema:
        cur.execute(f"""CREATE TABLE {table} (id TEXT PRIMARY KEY, {idcol} TEXT,
            spotify_match_status TEXT, spotify_last_attempted TIMESTAMP)""")
    cur.execute("INSERT INTO tracks (id) VALUES ('t1')")
    cur.execute("INSERT INTO albums (id) VALUES ('al1')")
    cur.execute("INSERT INTO artists (id) VALUES ('ar1')")
    conn.commit()
    return conn, cur


def test_apply_writes_ids_status_and_timestamp():
    """A landed fill writes the id, the 'matched' status and a timestamp."""
    conn, cur = _make_db()
    plan = plan_reconcile(
        {'spotify_track_id': 'TRK', 'spotify_album_id': 'ALB', 'spotify_artist_id': 'ART'},
        {'track': {}, 'album': {}, 'artist': {}},
    )
    outcome = apply_reconcile_plan(cur, {'track': 't1', 'album': 'al1', 'artist': 'ar1'}, plan)
    conn.commit()
    assert isinstance(outcome, ReconcileApplied)
    assert outcome.rows_updated == 3
    assert outcome.ids_filled == 3

    cur.execute("SELECT spotify_track_id, spotify_match_status, spotify_last_attempted FROM tracks WHERE id='t1'")
    track_id, status, attempted = cur.fetchone()
    assert track_id == 'TRK'
    assert status == 'matched'
    assert attempted is not None


def test_apply_guard_blocks_overwrite_under_concurrency():
    """A value written after planning survives the apply untouched."""
    # THE headline hardening: a worker fills the column AFTER we planned
    # (plan saw empty) but BEFORE we apply. The guarded UPDATE must no-op
    # and leave the worker's value intact.
    conn, cur = _make_db()
    plan = plan_reconcile({'spotify_artist_id': 'FROM_FILE'}, {'artist': {}})  # planned: empty
    # Simulate a concurrent enrichment worker matching it in the meantime.
    cur.execute("UPDATE artists SET spotify_artist_id='FROM_WORKER', spotify_match_status='matched' WHERE id='ar1'")
    conn.commit()

    outcome = apply_reconcile_plan(cur, {'artist': 'ar1'}, plan)
    conn.commit()
    # guard blocked it
    assert outcome.ids_filled == 0
    assert outcome.rows_updated == 0

    cur.execute("SELECT spotify_artist_id FROM artists WHERE id='ar1'")
    assert cur.fetchone()[0] == 'FROM_WORKER'  # worker's value preserved


def test_apply_guard_treats_empty_string_as_fillable():
    """An empty-string column counts as a gap, just like NULL."""
    conn, cur = _make_db()
    cur.execute("UPDATE artists SET spotify_artist_id='' WHERE id='ar1'")  # empty string, not NULL
    conn.commit()
    plan = plan_reconcile({'spotify_artist_id': 'ART'}, {'artist': {}})
    outcome = apply_reconcile_plan(cur, {'artist': 'ar1'}, plan)
    conn.commit()
    assert outcome.ids_filled == 1
    cur.execute("SELECT spotify_artist_id FROM artists WHERE id='ar1'")
    assert cur.fetchone()[0] == 'ART'


def test_apply_skips_unknown_columns_without_erroring():
    """A fill aimed at a column the schema lacks is skipped silently."""
    # Schema missing a provider's columns must not raise — the plan targets
    # tidal_id which this minimal schema lacks; it's silently skipped.
    conn, cur = _make_db()
    plan = plan_reconcile({'tidal_artist_id': 'TID', 'spotify_artist_id': 'ART'},
                          {'track': {}, 'album': {}, 'artist': {}})
    outcome = apply_reconcile_plan(cur, {'artist': 'ar1'}, plan)
    conn.commit()
    cur.execute("SELECT spotify_artist_id FROM artists WHERE id='ar1'")
    assert cur.fetchone()[0] == 'ART'
    assert outcome.ids_filled == 1  # only the existing spotify column landed


def test_apply_skips_entity_with_no_id():
    """A fill whose entity has no id supplied writes nothing."""
    conn, cur = _make_db()
    plan = plan_reconcile({'spotify_album_id': 'ALB'}, {'album': {}})
    outcome = apply_reconcile_plan(cur, {'track': 't1'}, plan)  # no album id supplied
    assert outcome.rows_updated == 0
    assert outcome.ids_filled == 0


def test_apply_empty_plan_is_noop():
    """Applying a plan with no fills touches no rows."""
    conn, cur = _make_db()
    outcome = apply_reconcile_plan(cur, {'track': 't1'}, ReconcilePlan())
    assert outcome.rows_updated == 0
    assert outcome.ids_filled == 0


# ---------------------------------------------------------------------------
# reconcile_track_row — the per-track orchestration (id extraction, plan→apply,
# sibling-map freshening)
# ---------------------------------------------------------------------------

def test_reconcile_track_row_unreadable_file_is_noop():
    """Tags of None mark the file unreadable and fill nothing."""
    conn, cur = _make_db()
    outcome = reconcile_track_row(cur, {'id': 't1'}, {}, {}, None)
    assert outcome.readable is False
    assert outcome.applied.ids_filled == 0


def test_reconcile_track_row_fills_track_and_parents():
    """One call fills the track plus its album and artist, and updates the maps."""
    conn, cur = _make_db()
    track_row = {'id': 't1', 'album_id': 'al1', 'artist_id': 'ar1'}
    album_map = {'al1': {}}
    artist_map = {'ar1': {}}
    embedded = {'spotify_track_id': 'TRK', 'spotify_album_id': 'ALB', 'spotify_artist_id': 'ART'}
    outcome = reconcile_track_row(cur, track_row, album_map, artist_map, embedded)
    conn.commit()
    assert outcome.readable is True
    assert outcome.applied.ids_filled == 3
    assert outcome.applied.rows_updated == 3
    # parent maps were freshened in place
    assert album_map['al1']['spotify_album_id'] == 'ALB'
    assert artist_map['ar1']['spotify_artist_id'] == 'ART'


def test_reconcile_sibling_tracks_dont_refill_shared_parent():
    """The second track on an album/artist only fills its own track id."""
    # Two tracks on the same album/artist. The first fills the album+artist
    # ids; the second must see them already present (via the freshened map)
    # and NOT re-apply — proving the map keeps siblings from redundant work.
    conn, cur = _make_db()
    cur.execute("INSERT INTO tracks (id) VALUES ('t2')")
    conn.commit()
    album_map = {'al1': {}}
    artist_map = {'ar1': {}}
    first_tags = {'spotify_album_id': 'ALB', 'spotify_artist_id': 'ART', 'spotify_track_id': 'T1'}

    first = reconcile_track_row(cur, {'id': 't1', 'album_id': 'al1', 'artist_id': 'ar1'},
                                album_map, artist_map, first_tags)
    # Second track: same album/artist ids embedded, its own track id.
    second_tags = {'spotify_album_id': 'ALB', 'spotify_artist_id': 'ART', 'spotify_track_id': 'T2'}
    second = reconcile_track_row(cur, {'id': 't2', 'album_id': 'al1', 'artist_id': 'ar1'},
                                 album_map, artist_map, second_tags)
    conn.commit()

    assert first.applied.ids_filled == 3   # track + album + artist
    assert second.applied.ids_filled == 1  # only t2's own track id; parents already filled
    assert second.conflicts == 0


def test_reconcile_track_row_handles_null_parent_ids():
    """With no album/artist linkage only the track's own id is written."""
    conn, cur = _make_db()
    # Track with no album/artist linkage — only its own id should fill.
    outcome = reconcile_track_row(cur, {'id': 't1', 'album_id': None, 'artist_id': None},
                                  {}, {}, {'spotify_track_id': 'TRK', 'spotify_album_id': 'ALB'})
    conn.commit()
    assert outcome.applied.ids_filled == 1  # album fill has no album id to land on
    cur.execute("SELECT spotify_track_id FROM tracks WHERE id='t1'")
    assert cur.fetchone()[0] == 'TRK'
# ── Reconcile embedded provider IDs (gap-fill DB from file tags) ──
#
# Files that SoulSync (or MusicBrainz Picard) already tagged carry Spotify /
# iTunes / MusicBrainz / Deezer / Tidal / AudioDB / Genius IDs in their
# metadata. Reading them back and gap-filling the {provider}_id +
# {provider}_match_status='matched' columns lets the enrichment workers skip
# the API lookup entirely — large API savings on an already-tagged library.
# Gap-fill only: an existing id is never overwritten (see
# core/library/embedded_id_reconcile.py).
_reconcile_ids_state = {
    'status': 'idle',        # idle | running | done
    'total': 0,
    'processed': 0,
    'entities_updated': 0,   # track/album/artist rows written
    'ids_filled': 0,         # individual id columns filled
    'conflicts': 0,          # embedded id disagreed with a stored id (not applied)
    'unreadable': 0,         # files missing / unreadable by mutagen
    'current': '',
    'error': None,           # message of the failure that aborted the last run, else None
}
_reconcile_ids_lock = threading.Lock()


@app.route('/api/library/reconcile-embedded-ids', methods=['POST'])
def reconcile_embedded_ids():
    """Scan every library file for embedded provider IDs and gap-fill them
    into the DB so enrichment workers skip the API lookup. Runs in the
    background; poll the status endpoint for progress.

    Returns 409 while a previous run is still in progress, 500 if the job
    could not be started, otherwise a success payload once the worker thread
    has been launched.
    """
    try:
        with _reconcile_ids_lock:
            if _reconcile_ids_state['status'] == 'running':
                return jsonify({"success": False, "error": "A reconcile is already in progress"}), 409
            _reconcile_ids_state.update({
                'status': 'running', 'total': 0, 'processed': 0,
                'entities_updated': 0, 'ids_filled': 0, 'conflicts': 0,
                'unreadable': 0, 'current': 'Starting…', 'error': None,
            })

        database = get_database()

        def _run():
            """Background worker: page through tracks and reconcile each file."""
            from core.library.file_tags import read_embedded_tags
            from core.library.embedded_id_reconcile import reconcile_track_row
            conn = None
            try:
                conn = database._get_connection()
                cur = conn.cursor()
                # Parent IDs in memory (these tables are far smaller than tracks).
                cur.execute("SELECT * FROM albums")
                album_map = {str(r['id']): dict(r) for r in cur.fetchall()}
                cur.execute("SELECT * FROM artists")
                artist_map = {str(r['id']): dict(r) for r in cur.fetchall()}
                # Track IDs only first (light); rows are pulled per page below so
                # memory stays bounded on large libraries. Each page's SELECT is
                # fully fetched before any UPDATE, so reusing one cursor is safe.
                cur.execute("SELECT id FROM tracks WHERE file_path IS NOT NULL AND TRIM(file_path) != ''")
                track_ids = [str(r['id']) for r in cur.fetchall()]

                with _reconcile_ids_lock:
                    _reconcile_ids_state['total'] = len(track_ids)

                PAGE = 500
                for start in range(0, len(track_ids), PAGE):
                    page = track_ids[start:start + PAGE]
                    ph = ','.join('?' * len(page))
                    cur.execute(f"SELECT * FROM tracks WHERE id IN ({ph})", page)
                    rows = [dict(r) for r in cur.fetchall()]

                    for tr in rows:
                        title = tr.get('title') or '?'
                        with _reconcile_ids_lock:
                            _reconcile_ids_state['current'] = title

                        # One bad file must never abort the whole library scan.
                        try:
                            resolved = _resolve_library_file_path(tr.get('file_path'))
                            info = read_embedded_tags(resolved) if resolved else {'available': False}
                            tags = info.get('tags') if info.get('available') else None

                            result = reconcile_track_row(cur, tr, album_map, artist_map, tags)

                            with _reconcile_ids_lock:
                                if not result.readable:
                                    _reconcile_ids_state['unreadable'] += 1
                                else:
                                    _reconcile_ids_state['entities_updated'] += result.applied.rows_updated
                                    _reconcile_ids_state['ids_filled'] += result.applied.ids_filled
                                    _reconcile_ids_state['conflicts'] += result.conflicts
                        except Exception as _te:
                            # Any per-track failure is counted as unreadable and skipped.
                            logger.debug("reconcile: skipped track %s: %s", tr.get('id'), _te)
                            with _reconcile_ids_lock:
                                _reconcile_ids_state['unreadable'] += 1
                        finally:
                            with _reconcile_ids_lock:
                                _reconcile_ids_state['processed'] += 1

                    # Commit per page — releases the write lock so concurrent
                    # enrichment workers aren't starved during a long scan.
                    conn.commit()
            except Exception as e:
                # Keep the traceback in the log and expose the failure to
                # pollers; otherwise an aborted run looks like a clean 'done'.
                logger.exception("Reconcile embedded IDs background error: %s", e)
                with _reconcile_ids_lock:
                    _reconcile_ids_state['error'] = str(e)
            finally:
                if conn is not None:
                    try:
                        conn.close()
                    except Exception:
                        pass
                with _reconcile_ids_lock:
                    _reconcile_ids_state['status'] = 'done'
                    _reconcile_ids_state['current'] = ''

        thread = threading.Thread(target=_run, daemon=True, name="ReconcileEmbeddedIds")
        thread.start()
        return jsonify({"success": True, "message": "Reconcile started"})

    except Exception as e:
        logger.error(f"Reconcile embedded IDs kickoff error: {e}")
        # Release the 'running' claim so a later request can retry.
        with _reconcile_ids_lock:
            _reconcile_ids_state['status'] = 'idle'
        return jsonify({"success": False, "error": str(e)}), 500


@app.route('/api/library/reconcile-embedded-ids/status', methods=['GET'])
def get_reconcile_embedded_ids_status():
    """Poll the status of the embedded-ID reconcile job."""
    with _reconcile_ids_lock:
        # Copy under the lock so the response is a consistent snapshot.
        return jsonify(dict(_reconcile_ids_state))

# ── ReplayGain Analysis endpoints ──
100644 --- a/webui/index.html +++ b/webui/index.html @@ -6591,6 +6591,43 @@ +
Read provider IDs (Spotify, MusicBrainz, iTunes, Deezer…) already embedded in your files and fill them into the database — lets enrichment workers skip redundant API lookups. Only fills blanks; never overwrites an existing match.
+Ready to scan
+ +0 / 0 files scanned (0.0%)
+