SoulSync/scripts/dedupe_source_ids.py

#!/usr/bin/env python3
"""One-off repair for source ids that enrichment wrongly shared across multiple
artists (the Kendrick/Jorja corruption — one Deezer/AudioDB/Qobuz/Tidal id
stamped onto several unrelated artists).

Dry-run by default — shows exactly what it would clear and writes nothing.

Usage:
    python scripts/dedupe_source_ids.py            # dry-run (review first)
    python scripts/dedupe_source_ids.py --apply    # actually clear them

After --apply, run metadata enrichment so the (now name-checked) workers
re-derive each artist's id correctly. Stop the app first so the DB isn't locked.
"""

import logging
import os
import sys

# Allow running directly (`python scripts/dedupe_source_ids.py`) — put the repo
# root on the path so `core` / `database` import.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from core.maintenance.dedupe_source_ids import clear_corrupt_source_ids  # noqa: E402
from database.music_database import MusicDatabase  # noqa: E402

if not logging.getLogger().handlers:
    logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger("dedupe_source_ids")


def main():
    apply = "--apply" in sys.argv[1:]
    db = MusicDatabase()
    report = clear_corrupt_source_ids(db, dry_run=not apply)

    mode = "APPLYING" if apply else "DRY-RUN (no changes written)"
    logger.info(f"=== Source-id corruption repair — {mode} ===")
    logger.info(
        f"Corrupt clusters: {report['cluster_count']}  |  "
        f"artists affected: {report['artist_count']}"
    )
    if report['by_source']:
        logger.info("By source: " + ", ".join(
            f"{s}={n}" for s, n in sorted(report['by_source'].items())
        ))
    for c in report['clusters']:
        logger.info(f"  [{c['source']}] id {c['source_id']} -> {', '.join(c['artists'])}")

    if not report['cluster_count']:
        logger.info("Nothing to clean — no shared source ids across differently-named artists.")
    elif apply:
        logger.info("Cleared. Now run metadata enrichment to re-derive these ids correctly.")
    else:
        logger.info("Re-run with --apply to clear these (then run enrichment to re-derive).")


if __name__ == "__main__":
    main()