You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
SoulSync/scripts/dedupe_source_ids.py

60 lines
2.2 KiB

#!/usr/bin/env python3
"""One-off repair for source ids that enrichment wrongly shared across multiple
artists (the Kendrick/Jorja corruption — one Deezer/AudioDB/Qobuz/Tidal id
stamped onto several unrelated artists).
Dry-run by default — shows exactly what it would clear and writes nothing.
Usage:
    python scripts/dedupe_source_ids.py          # dry-run (review first)
    python scripts/dedupe_source_ids.py --apply  # actually clear them
After --apply, run metadata enrichment so the (now name-checked) workers
re-derive each artist's id correctly. Stop the app first so the DB isn't locked.
"""
import logging
import os
import sys
# Allow running directly (`python scripts/dedupe_source_ids.py`) — put the repo
# root on the path so `core` / `database` import.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from core.maintenance.dedupe_source_ids import clear_corrupt_source_ids # noqa: E402
from database.music_database import MusicDatabase # noqa: E402
# Named logger used for all of this script's output.
logger = logging.getLogger("dedupe_source_ids")

# Only install the bare "%(message)s" handler when the root logger has not
# been configured by something else already.
if not logging.getLogger().hasHandlers():
    logging.basicConfig(level=logging.INFO, format="%(message)s")
def main():
    """Run the source-id corruption repair and log a summary of the outcome.

    The run is a dry-run unless ``--apply`` appears among the command-line
    arguments; every other argument is ignored. The repair call happens
    first, and the report it returns is then logged: a header with the mode,
    the cluster/artist totals, an optional per-source breakdown, one line per
    cluster, and a closing hint about what to do next.
    """
    should_apply = "--apply" in sys.argv[1:]

    database = MusicDatabase()
    report = clear_corrupt_source_ids(database, dry_run=not should_apply)

    if should_apply:
        mode = "APPLYING"
    else:
        mode = "DRY-RUN (no changes written)"
    logger.info(f"=== Source-id corruption repair — {mode} ===")

    cluster_count = report['cluster_count']
    artist_count = report['artist_count']
    logger.info(f"Corrupt clusters: {cluster_count} | artists affected: {artist_count}")

    # Per-source totals are only shown when there is something to show.
    per_source = report['by_source']
    if per_source:
        breakdown = ", ".join(
            f"{source}={count}" for source, count in sorted(per_source.items())
        )
        logger.info("By source: " + breakdown)

    # One line per cluster: the shared id and every artist it was stamped on.
    for cluster in report['clusters']:
        source, source_id = cluster['source'], cluster['source_id']
        artist_names = ', '.join(cluster['artists'])
        logger.info(f" [{source}] id {source_id} -> {artist_names}")

    # Closing hint depends on whether anything was found and on the mode.
    if not cluster_count:
        closing = "Nothing to clean — no shared source ids across differently-named artists."
    elif should_apply:
        closing = "Cleared. Now run metadata enrichment to re-derive these ids correctly."
    else:
        closing = "Re-run with --apply to clear these (then run enrichment to re-derive)."
    logger.info(closing)
# Run the repair only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()