You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
SoulSync/core/soulid_worker.py

671 lines
25 KiB

"""
SoulID Worker — generates deterministic soul IDs for artists, albums, and tracks.
Runs as a background worker that processes library entries without soul IDs,
computes a deterministic hash from normalized metadata, and stores the result.
Hash inputs (all lowercased, stripped of accents/punctuation, collapsed):
- Artist: normalize(artist_name) + normalize(debut_year) if known
Debut year is sourced from iTunes + Deezer APIs (not local DB)
to ensure deterministic results across all SoulSync nodes.
- Album: normalize(artist_name) + normalize(album_name)
- Track (song): normalize(artist_name) + normalize(track_name)
- Track (album): normalize(artist_name) + normalize(album_name) + normalize(track_name)
The "song" soul ID links different versions of the same song (single vs album).
The "album track" soul ID is specific to a track on a particular release.
"""
import hashlib
import re
import threading
import time
import unicodedata
from typing import Dict, Any, List, Optional
from utils.logging_config import get_logger
logger = get_logger("soulid_worker")
def normalize_for_soul_id(text: str) -> str:
"""Aggressively normalize a string for deterministic hashing.
- Lowercase
- Strip accents/diacritics (Beyoncé → beyonce)
- Remove parentheticals: (feat. X), (Deluxe), (Remastered), [Live], etc.
- Remove all non-alphanumeric characters
- Collapse whitespace
"""
if not text:
return ''
s = text.lower()
# Decompose unicode and strip combining marks (accents)
s = unicodedata.normalize('NFD', s)
s = ''.join(c for c in s if unicodedata.category(c) != 'Mn')
# Remove parenthetical/bracket suffixes
s = re.sub(r'\s*[\(\[][^)\]]*[\)\]]', '', s)
# Remove all non-alphanumeric
s = re.sub(r'[^a-z0-9]', '', s)
return s
def generate_soul_id(*parts: str) -> str:
"""Generate a soul ID from normalized parts.
Returns a 'soul_' prefixed hex string (first 16 chars of SHA-256).
"""
combined = ''.join(normalize_for_soul_id(p) for p in parts if p)
if not combined:
return ''
digest = hashlib.sha256(combined.encode('utf-8')).hexdigest()[:16]
return f'soul_{digest}'
class SoulIDWorker:
"""Background worker that generates soul IDs for all library entities.
Artists are processed one at a time (API calls to iTunes/Deezer for
deterministic debut year). Albums and tracks are processed in batches
(local DB only, no API calls).
"""
def __init__(self, database):
self.db = database
# Worker state
self.running = False
self.paused = False
self.should_stop = False
self.thread = None
self.current_item = None
# API clients (lazy-initialized)
self._itunes_client = None
self._deezer_client = None
self._matching_engine = None
# Statistics
self.stats = {
'artists_processed': 0,
'albums_processed': 0,
'tracks_processed': 0,
'errors': 0,
'pending': 0,
}
# Processing config
self.batch_size = 100 # For albums/tracks (no API calls)
self.artist_sleep = 3.0 # Between artist API lookups (rate limit courtesy)
self.inter_batch_sleep = 0.5 # Between album/track batches
self.idle_sleep = 30
self.album_match_threshold = 0.80
logger.info("SoulID worker initialized")
def _get_itunes_client(self):
if self._itunes_client is None:
try:
from core.itunes_client import iTunesClient
self._itunes_client = iTunesClient()
except Exception as e:
logger.error(f"Failed to init iTunes client: {e}")
return self._itunes_client
def _get_deezer_client(self):
if self._deezer_client is None:
try:
from core.deezer_client import DeezerClient
self._deezer_client = DeezerClient()
except Exception as e:
logger.error(f"Failed to init Deezer client: {e}")
return self._deezer_client
def _get_matching_engine(self):
if self._matching_engine is None:
try:
from core.matching_engine import MusicMatchingEngine
self._matching_engine = MusicMatchingEngine()
except Exception as e:
logger.error(f"Failed to init matching engine: {e}")
return self._matching_engine
def start(self):
if self.running:
return
self.running = True
self.should_stop = False
self.thread = threading.Thread(target=self._run, daemon=True)
self.thread.start()
logger.info("SoulID worker started")
def stop(self):
if not self.running:
return
self.should_stop = True
self.running = False
if self.thread:
self.thread.join(timeout=5)
logger.info("SoulID worker stopped")
def pause(self):
self.paused = True
def resume(self):
self.paused = False
def get_stats(self) -> Dict[str, Any]:
self.stats['pending'] = self._count_pending()
is_running = self.running and self.thread is not None and self.thread.is_alive()
is_idle = is_running and not self.paused and self.stats['pending'] == 0
return {
'enabled': True,
'running': is_running and not self.paused,
'paused': self.paused,
'idle': is_idle,
'current_item': self.current_item,
'stats': self.stats.copy(),
}
# ── Main loop ──
def _run(self):
logger.info("SoulID worker thread started")
# One-time migration: reset artist soul IDs when algorithm changes
self._migrate_artist_soul_ids()
while not self.should_stop:
try:
if self.paused:
time.sleep(1)
continue
processed = 0
processed += self._process_next_artist()
processed += self._process_albums()
processed += self._process_tracks()
if processed == 0:
self.current_item = None
time.sleep(self.idle_sleep)
else:
# Albums/tracks get inter_batch_sleep, artists get their
# own sleep inside _process_next_artist
time.sleep(self.inter_batch_sleep)
except Exception as e:
logger.error(f"Error in SoulID worker loop: {e}", exc_info=True)
self.stats['errors'] += 1
time.sleep(5)
self.current_item = None
logger.info("SoulID worker thread finished")
# ── Artist processing (one at a time, API-based) ──
def _process_next_artist(self) -> int:
"""Process a single artist — uses track-verified API lookup for canonical ID."""
conn = None
try:
conn = self.db._get_connection()
cursor = conn.cursor()
cursor.execute("""
SELECT id, name FROM artists
WHERE (soul_id IS NULL OR soul_id = '')
AND name IS NOT NULL AND name != ''
AND id IS NOT NULL
LIMIT 1
""")
row = cursor.fetchone()
if not row:
return 0
artist_id, name = row
self.current_item = f"Artist: {name}"
# Get a track title from this artist for verification lookup
cursor.execute("""
SELECT title FROM tracks
WHERE artist_id = ? AND title IS NOT NULL AND title != ''
ORDER BY title ASC
LIMIT 1
""", (artist_id,))
track_row = cursor.fetchone()
verify_track = track_row[0] if track_row else None
# Look up canonical artist ID from Deezer + iTunes using track verification
canonical_id = self._lookup_canonical_artist_id(name, verify_track)
if canonical_id:
soul_id = generate_soul_id(name, str(canonical_id))
self.current_item = f"Artist: {name} (id:{canonical_id})"
else:
# Fallback: use name + first album title alphabetically
cursor.execute("""
SELECT title FROM albums
WHERE artist_id = ? AND title IS NOT NULL AND title != ''
ORDER BY title ASC
LIMIT 1
""", (artist_id,))
album_row = cursor.fetchone()
if album_row:
soul_id = generate_soul_id(name, album_row[0])
self.current_item = f"Artist: {name} (album fallback)"
else:
soul_id = generate_soul_id(name)
if not soul_id:
soul_id = f'soul_unnamed_{artist_id}'
cursor.execute(
"UPDATE artists SET soul_id = ? WHERE id = ? AND (soul_id IS NULL OR soul_id = '')",
(soul_id, artist_id)
)
conn.commit()
self.stats['artists_processed'] += 1
logger.info(f"Generated soul ID for artist: {name}" + (f" (canonical id: {canonical_id})" if canonical_id else ""))
# Rate limit courtesy for API calls
time.sleep(self.artist_sleep)
return 1
except Exception as e:
logger.error(f"Error processing artist: {e}")
self.stats['errors'] += 1
if conn:
try:
conn.rollback()
except Exception:
pass
return 0
finally:
if conn:
conn.close()
def _lookup_canonical_artist_id(self, artist_name: str, verify_track: Optional[str]) -> Optional[int]:
"""Look up a canonical artist ID from Deezer and iTunes using track verification.
Searches both services for 'artist_name track_title' to find the exact artist,
then returns max(deezer_id, itunes_id) as a deterministic canonical identifier.
Any SoulSync instance with the same artist and at least one matching track
will arrive at the same canonical ID.
Args:
artist_name: Artist name to search for
verify_track: A track title from the artist's library for verification
Returns:
max(deezer_id, itunes_id) as int, or the single available ID, or None
"""
if not verify_track:
return None
matching = self._get_matching_engine()
norm_artist = matching.normalize_string(artist_name) if matching else artist_name.lower().strip()
deezer_artist_id = None
itunes_artist_id = None
# Search Deezer by "artist track" to find the exact artist
deezer = self._get_deezer_client()
if deezer:
try:
import requests as req
query = f"{artist_name} {verify_track}"
resp = req.get('https://api.deezer.com/search', params={'q': query, 'limit': 5}, timeout=10)
if resp.ok:
for item in resp.json().get('data', []):
result_artist = item.get('artist', {}).get('name', '')
norm_result = matching.normalize_string(result_artist) if matching else result_artist.lower().strip()
if norm_result == norm_artist or (matching and matching.similarity_score(norm_artist, norm_result) >= 0.85):
raw_id = item.get('artist', {}).get('id')
if raw_id:
deezer_artist_id = int(raw_id)
logger.debug(f"Deezer artist ID for '{artist_name}': {deezer_artist_id}")
break
time.sleep(0.3)
except Exception as e:
logger.debug(f"Deezer track search failed for '{artist_name}': {e}")
# Search iTunes by "artist track" to find the exact artist
itunes = self._get_itunes_client()
if itunes:
try:
query = f"{artist_name} {verify_track}"
raw_results = itunes._search(query, entity='song', limit=5)
if raw_results:
for item in raw_results:
result_artist = item.get('artistName', '')
norm_result = matching.normalize_string(result_artist) if matching else result_artist.lower().strip()
if norm_result == norm_artist or (matching and matching.similarity_score(norm_artist, norm_result) >= 0.85):
raw_id = item.get('artistId')
if raw_id:
itunes_artist_id = int(raw_id)
logger.debug(f"iTunes artist ID for '{artist_name}': {itunes_artist_id}")
break
time.sleep(0.3)
except Exception as e:
logger.debug(f"iTunes track search failed for '{artist_name}': {e}")
# Return max of both IDs (deterministic regardless of which source each instance has)
if deezer_artist_id and itunes_artist_id:
canonical = max(deezer_artist_id, itunes_artist_id)
logger.debug(f"Canonical ID for '{artist_name}': {canonical} (deezer={deezer_artist_id}, itunes={itunes_artist_id})")
return canonical
elif deezer_artist_id:
return deezer_artist_id
elif itunes_artist_id:
return itunes_artist_id
return None
def _lookup_debut_year(self, artist_name: str, db_album_names: List[str]) -> Optional[str]:
"""Look up an artist's debut year from iTunes and Deezer.
Searches both sources for the artist, verifies the match by comparing
their discography against our DB albums, then pools all album years
from both matched sources and returns the earliest.
Args:
artist_name: Artist name to search for
db_album_names: Album names from our DB for this artist
Returns:
Earliest release year as string (e.g. '2011'), or None
"""
if not db_album_names:
# No albums to cross-reference — can't verify which artist is correct
return None
matching = self._get_matching_engine()
if not matching:
return None
# Search both sources
itunes = self._get_itunes_client()
deezer = self._get_deezer_client()
itunes_results = []
deezer_results = []
try:
if itunes:
itunes_results = itunes.search_artists(artist_name, limit=5) or []
except Exception as e:
logger.debug(f"iTunes artist search failed for '{artist_name}': {e}")
try:
if deezer:
deezer_results = deezer.search_artists(artist_name, limit=5) or []
except Exception as e:
logger.debug(f"Deezer artist search failed for '{artist_name}': {e}")
# Each source independently steps through its results to find a match
itunes_discog = self._find_matching_discography(itunes, itunes_results, db_album_names, matching, 'iTunes')
deezer_discog = self._find_matching_discography(deezer, deezer_results, db_album_names, matching, 'Deezer')
# Pool all albums from both matched sources
all_years = []
for discog in (itunes_discog, deezer_discog):
if discog:
for album in discog:
year = self._extract_year(album)
if year:
all_years.append(year)
if all_years:
return min(all_years)
return None
def _find_matching_discography(self, client, artist_results, db_album_names: List[str],
matching, source_name: str) -> Optional[list]:
"""Step through artist search results, return the discography of the first
one whose albums overlap with our DB albums.
Args:
client: iTunes or Deezer client
artist_results: List of Artist dataclass objects from search
db_album_names: Our DB album names for comparison
matching: MatchingEngine instance
source_name: 'iTunes' or 'Deezer' for logging
Returns:
List of Album objects from the matched artist's discography, or None
"""
if not client or not artist_results:
return None
for artist in artist_results:
try:
discog = client.get_artist_albums(artist.id, album_type='album,single', limit=50)
if not discog:
continue
# Check if any discography album matches any DB album
for api_album in discog:
api_name = api_album.name if hasattr(api_album, 'name') else str(api_album)
for db_name in db_album_names:
score = matching.similarity_score(
matching.normalize_string(api_name),
matching.normalize_string(db_name)
)
if score >= self.album_match_threshold:
logger.debug(f" {source_name}: matched '{artist.name}' via album '{api_name}''{db_name}' (score={score:.2f})")
return discog
except Exception as e:
logger.debug(f" {source_name}: discography fetch failed for '{artist.name}': {e}")
continue
return None
@staticmethod
def _extract_year(album) -> Optional[str]:
"""Extract a 4-digit year from an Album object's release_date."""
release_date = getattr(album, 'release_date', '') or ''
release_date = str(release_date)
if len(release_date) >= 4:
year = release_date[:4]
if year.isdigit() and int(year) > 1900:
return year
return None
# ── Album processing (batch, local DB only) ──
def _process_albums(self) -> int:
conn = None
try:
conn = self.db._get_connection()
cursor = conn.cursor()
cursor.execute("""
SELECT al.id, al.title, ar.name as artist_name
FROM albums al
JOIN artists ar ON ar.id = al.artist_id
WHERE (al.soul_id IS NULL OR al.soul_id = '')
AND al.title IS NOT NULL AND al.title != ''
AND ar.name IS NOT NULL AND ar.name != ''
AND al.id IS NOT NULL
LIMIT ?
""", (self.batch_size,))
rows = cursor.fetchall()
if not rows:
return 0
count = 0
for album_id, title, artist_name in rows:
if self.should_stop:
break
soul_id = generate_soul_id(artist_name, title)
if not soul_id:
soul_id = f'soul_unnamed_{album_id}'
cursor.execute(
"UPDATE albums SET soul_id = ? WHERE id = ? AND (soul_id IS NULL OR soul_id = '')",
(soul_id, album_id)
)
count += 1
self.current_item = f"Album: {artist_name} - {title}"
if count > 0:
conn.commit()
self.stats['albums_processed'] += count
logger.info(f"Generated soul IDs for {count} albums")
return count
except Exception as e:
logger.error(f"Error processing albums: {e}")
self.stats['errors'] += 1
if conn:
try:
conn.rollback()
except Exception:
pass
return 0
finally:
if conn:
conn.close()
# ── Track processing (batch, local DB only) ──
def _process_tracks(self) -> int:
conn = None
try:
conn = self.db._get_connection()
cursor = conn.cursor()
cursor.execute("""
SELECT t.id, t.title, ar.name as artist_name, al.title as album_title
FROM tracks t
JOIN artists ar ON ar.id = t.artist_id
LEFT JOIN albums al ON al.id = t.album_id
WHERE (t.soul_id IS NULL OR t.soul_id = '')
AND t.title IS NOT NULL AND t.title != ''
AND ar.name IS NOT NULL AND ar.name != ''
AND t.id IS NOT NULL
LIMIT ?
""", (self.batch_size,))
rows = cursor.fetchall()
if not rows:
return 0
count = 0
for track_id, title, artist_name, album_title in rows:
if self.should_stop:
break
# Song soul ID: artist + track (links singles to album versions)
song_soul_id = generate_soul_id(artist_name, title)
# Album track soul ID: artist + album + track (specific to this release)
album_soul_id = ''
if album_title:
album_soul_id = generate_soul_id(artist_name, album_title, title)
if not song_soul_id:
song_soul_id = f'soul_unnamed_{track_id}'
cursor.execute(
"UPDATE tracks SET soul_id = ?, album_soul_id = ? WHERE id = ? AND (soul_id IS NULL OR soul_id = '')",
(song_soul_id, album_soul_id or None, track_id)
)
count += 1
self.current_item = f"Track: {artist_name} - {title}"
if count > 0:
conn.commit()
self.stats['tracks_processed'] += count
logger.info(f"Generated soul IDs for {count} tracks")
return count
except Exception as e:
logger.error(f"Error processing tracks: {e}")
self.stats['errors'] += 1
if conn:
try:
conn.rollback()
except Exception:
pass
return 0
finally:
if conn:
conn.close()
# ── Migrations ──
def _migrate_artist_soul_ids(self):
"""One-time reset: clear all artist soul IDs when algorithm changes.
Uses a versioned metadata flag to run only once per algorithm version."""
conn = None
try:
conn = self.db._get_connection()
cursor = conn.cursor()
cursor.execute("SELECT value FROM metadata WHERE key = 'soulid_artist_version' LIMIT 1")
row = cursor.fetchone()
if row and row[0] == 'debut_year_api_v2':
return # Already on latest version
# Reset all artist soul IDs for regeneration
cursor.execute("UPDATE artists SET soul_id = NULL WHERE soul_id IS NOT NULL")
reset_count = cursor.rowcount
# Mark current version
cursor.execute("INSERT OR REPLACE INTO metadata (key, value) VALUES ('soulid_artist_version', 'debut_year_api_v2')")
# Clean up old flags
cursor.execute("DELETE FROM metadata WHERE key IN ('soulid_artist_v2', 'soulid_artist_v3')")
conn.commit()
if reset_count > 0:
logger.info(f"SoulID migration: reset {reset_count} artist soul IDs for API-based debut year regeneration")
except Exception as e:
logger.error(f"SoulID migration failed: {e}")
if conn:
try:
conn.rollback()
except Exception:
pass
finally:
if conn:
conn.close()
# ── Helpers ──
def _count_pending(self) -> int:
conn = None
try:
conn = self.db._get_connection()
cursor = conn.cursor()
total = 0
# Must match the WHERE clauses in _process_* methods exactly
cursor.execute("""
SELECT COUNT(*) FROM artists
WHERE (soul_id IS NULL OR soul_id = '')
AND name IS NOT NULL AND name != ''
AND id IS NOT NULL
""")
total += (cursor.fetchone() or [0])[0]
cursor.execute("""
SELECT COUNT(*) FROM albums al
JOIN artists ar ON ar.id = al.artist_id
WHERE (al.soul_id IS NULL OR al.soul_id = '')
AND al.title IS NOT NULL AND al.title != ''
AND ar.name IS NOT NULL AND ar.name != ''
AND al.id IS NOT NULL
""")
total += (cursor.fetchone() or [0])[0]
cursor.execute("""
SELECT COUNT(*) FROM tracks t
JOIN artists ar ON ar.id = t.artist_id
WHERE (t.soul_id IS NULL OR t.soul_id = '')
AND t.title IS NOT NULL AND t.title != ''
AND ar.name IS NOT NULL AND ar.name != ''
AND t.id IS NOT NULL
""")
total += (cursor.fetchone() or [0])[0]
return total
except Exception:
return 0
finally:
if conn:
conn.close()