Rebuild artist SoulID: track-verified canonical ID from Deezer + iTunes

New algorithm: pick first track alphabetically from artist's library, search
both Deezer and iTunes APIs for 'artist track' to find the exact artist,
verify name matches, then use max(deezer_id, itunes_id) as the canonical
differentiator. Deterministic across instances — any SoulSync with the same
artist and at least one matching track will produce the same soul_id.

Fallback chain: canonical API ID → first album title → name only.
Migration clears all artist soul_ids on first v2.1 startup for regeneration.
pull/253/head
Broque Thomas 2 weeks ago
parent dc6f0db916
commit e8a5e253f6

@ -205,7 +205,7 @@ class SoulIDWorker:
# ── Artist processing (one at a time, API-based) ──
def _process_next_artist(self) -> int:
"""Process a single artist — queries iTunes + Deezer for debut year."""
"""Process a single artist — uses track-verified API lookup for canonical ID."""
conn = None
try:
conn = self.db._get_connection()
@ -224,21 +224,36 @@ class SoulIDWorker:
artist_id, name = row
self.current_item = f"Artist: {name}"
# Get this artist's album names from our DB for cross-referencing
# Get a track title from this artist for verification lookup
cursor.execute("""
SELECT title FROM albums
SELECT title FROM tracks
WHERE artist_id = ? AND title IS NOT NULL AND title != ''
ORDER BY title ASC
LIMIT 1
""", (artist_id,))
db_album_names = [r[0] for r in cursor.fetchall()]
track_row = cursor.fetchone()
verify_track = track_row[0] if track_row else None
# Look up debut year from iTunes + Deezer APIs
debut_year = self._lookup_debut_year(name, db_album_names)
# Look up canonical artist ID from Deezer + iTunes using track verification
canonical_id = self._lookup_canonical_artist_id(name, verify_track)
if debut_year:
soul_id = generate_soul_id(name, debut_year)
self.current_item = f"Artist: {name} ({debut_year})"
if canonical_id:
soul_id = generate_soul_id(name, str(canonical_id))
self.current_item = f"Artist: {name} (id:{canonical_id})"
else:
soul_id = generate_soul_id(name)
# Fallback: use name + first album title alphabetically
cursor.execute("""
SELECT title FROM albums
WHERE artist_id = ? AND title IS NOT NULL AND title != ''
ORDER BY title ASC
LIMIT 1
""", (artist_id,))
album_row = cursor.fetchone()
if album_row:
soul_id = generate_soul_id(name, album_row[0])
self.current_item = f"Artist: {name} (album fallback)"
else:
soul_id = generate_soul_id(name)
if not soul_id:
soul_id = f'soul_unnamed_{artist_id}'
@ -249,7 +264,7 @@ class SoulIDWorker:
)
conn.commit()
self.stats['artists_processed'] += 1
logger.info(f"Generated soul ID for artist: {name}" + (f" (debut {debut_year})" if debut_year else ""))
logger.info(f"Generated soul ID for artist: {name}" + (f" (canonical id: {canonical_id})" if canonical_id else ""))
# Rate limit courtesy for API calls
time.sleep(self.artist_sleep)
@ -268,6 +283,82 @@ class SoulIDWorker:
if conn:
conn.close()
def _lookup_canonical_artist_id(self, artist_name: str, verify_track: Optional[str]) -> Optional[int]:
"""Look up a canonical artist ID from Deezer and iTunes using track verification.
Searches both services for 'artist_name track_title' to find the exact artist,
then returns max(deezer_id, itunes_id) as a deterministic canonical identifier.
Any SoulSync instance with the same artist and at least one matching track
will arrive at the same canonical ID.
Args:
artist_name: Artist name to search for
verify_track: A track title from the artist's library for verification
Returns:
max(deezer_id, itunes_id) as int, or the single available ID, or None
"""
if not verify_track:
return None
matching = self._get_matching_engine()
norm_artist = matching.normalize_string(artist_name) if matching else artist_name.lower().strip()
deezer_artist_id = None
itunes_artist_id = None
# Search Deezer by "artist track" to find the exact artist
deezer = self._get_deezer_client()
if deezer:
try:
import requests as req
query = f"{artist_name} {verify_track}"
resp = req.get('https://api.deezer.com/search', params={'q': query, 'limit': 5}, timeout=10)
if resp.ok:
for item in resp.json().get('data', []):
result_artist = item.get('artist', {}).get('name', '')
norm_result = matching.normalize_string(result_artist) if matching else result_artist.lower().strip()
if norm_result == norm_artist or (matching and matching.similarity_score(norm_artist, norm_result) >= 0.85):
raw_id = item.get('artist', {}).get('id')
if raw_id:
deezer_artist_id = int(raw_id)
logger.debug(f"Deezer artist ID for '{artist_name}': {deezer_artist_id}")
break
time.sleep(0.3)
except Exception as e:
logger.debug(f"Deezer track search failed for '{artist_name}': {e}")
# Search iTunes by "artist track" to find the exact artist
itunes = self._get_itunes_client()
if itunes:
try:
query = f"{artist_name} {verify_track}"
raw_results = itunes._search(query, entity='song', limit=5)
if raw_results:
for item in raw_results:
result_artist = item.get('artistName', '')
norm_result = matching.normalize_string(result_artist) if matching else result_artist.lower().strip()
if norm_result == norm_artist or (matching and matching.similarity_score(norm_artist, norm_result) >= 0.85):
raw_id = item.get('artistId')
if raw_id:
itunes_artist_id = int(raw_id)
logger.debug(f"iTunes artist ID for '{artist_name}': {itunes_artist_id}")
break
time.sleep(0.3)
except Exception as e:
logger.debug(f"iTunes track search failed for '{artist_name}': {e}")
# Return max of both IDs (deterministic regardless of which source each instance has)
if deezer_artist_id and itunes_artist_id:
canonical = max(deezer_artist_id, itunes_artist_id)
logger.debug(f"Canonical ID for '{artist_name}': {canonical} (deezer={deezer_artist_id}, itunes={itunes_artist_id})")
return canonical
elif deezer_artist_id:
return deezer_artist_id
elif itunes_artist_id:
return itunes_artist_id
return None
def _lookup_debut_year(self, artist_name: str, db_album_names: List[str]) -> Optional[str]:
"""Look up an artist's debut year from iTunes and Deezer.

@ -2660,6 +2660,16 @@ class MusicDatabase:
cursor.execute("CREATE INDEX IF NOT EXISTS idx_tracks_soul_id ON tracks (soul_id)")
cursor.execute("CREATE INDEX IF NOT EXISTS idx_tracks_album_soul_id ON tracks (album_soul_id)")
# v2.1 migration: regenerate artist soul_ids with new canonical ID algorithm
# (was name+debut_year, now name+max(deezer_id,itunes_id) via track-verified lookup)
cursor.execute("SELECT value FROM metadata WHERE key = 'soulid_v2_migration'")
if not cursor.fetchone():
cursor.execute("UPDATE artists SET soul_id = NULL")
cleared = cursor.rowcount
cursor.execute("INSERT OR REPLACE INTO metadata (key, value) VALUES ('soulid_v2_migration', '1')")
if cleared > 0:
logger.info(f"🔄 SoulID v2 migration: cleared {cleared} artist soul_ids for regeneration")
except Exception as e:
logger.error(f"Error adding soul_id columns: {e}")

Loading…
Cancel
Save