mirror of https://github.com/Nezreka/SoulSync.git
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
60 lines
2.2 KiB
60 lines
2.2 KiB
#!/usr/bin/env python3
|
|
"""One-off repair for source ids that enrichment wrongly shared across multiple
|
|
artists (the Kendrick/Jorja corruption — one Deezer/AudioDB/Qobuz/Tidal id
|
|
stamped onto several unrelated artists).
|
|
|
|
Dry-run by default — shows exactly what it would clear and writes nothing.
|
|
|
|
Usage:
|
|
python scripts/dedupe_source_ids.py # dry-run (review first)
|
|
python scripts/dedupe_source_ids.py --apply # actually clear them
|
|
|
|
After --apply, run metadata enrichment so the (now name-checked) workers
|
|
re-derive each artist's id correctly. Stop the app first so the DB isn't locked.
|
|
"""
|
|
|
|
import logging
|
|
import os
|
|
import sys
|
|
|
|
# Allow running directly (`python scripts/dedupe_source_ids.py`) — put the repo
|
|
# root on the path so `core` / `database` import.
|
|
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
|
|
from core.maintenance.dedupe_source_ids import clear_corrupt_source_ids # noqa: E402
|
|
from database.music_database import MusicDatabase # noqa: E402
|
|
|
|
# Configure a bare message-only INFO output, but only when the root logger has
# no handler attached yet, so an existing logging configuration is left alone.
if not logging.getLogger().hasHandlers():
    logging.basicConfig(format="%(message)s", level=logging.INFO)

# Named logger used for every line this script reports.
logger = logging.getLogger("dedupe_source_ids")
|
|
|
|
|
|
def main():
    """Run the source-id repair and log a summary of the returned report.

    The run is a dry-run unless ``--apply`` appears among the command-line
    arguments. The report returned by ``clear_corrupt_source_ids`` is read for
    its ``cluster_count``, ``artist_count``, ``by_source`` and ``clusters``
    entries, which are logged in that order, followed by a closing hint that
    depends on the mode and on whether any cluster was reported.
    """
    write_changes = "--apply" in sys.argv[1:]
    database = MusicDatabase()
    report = clear_corrupt_source_ids(database, dry_run=not write_changes)

    if write_changes:
        mode = "APPLYING"
    else:
        mode = "DRY-RUN (no changes written)"

    # Headline: which mode ran, and the overall totals.
    logger.info(f"=== Source-id corruption repair — {mode} ===")
    logger.info(
        f"Corrupt clusters: {report['cluster_count']} | "
        f"artists affected: {report['artist_count']}"
    )

    # Per-source tallies, sorted by source name; skipped when empty.
    per_source = report['by_source']
    if per_source:
        tallies = [f"{s}={n}" for s, n in sorted(per_source.items())]
        logger.info("By source: " + ", ".join(tallies))

    # One line per cluster: the shared id and the artists carrying it.
    for c in report['clusters']:
        logger.info(f" [{c['source']}] id {c['source_id']} -> {', '.join(c['artists'])}")

    # Closing hint telling the operator what to do next.
    if not report['cluster_count']:
        closing = "Nothing to clean — no shared source ids across differently-named artists."
    elif write_changes:
        closing = "Cleared. Now run metadata enrichment to re-derive these ids correctly."
    else:
        closing = "Re-run with --apply to clear these (then run enrichment to re-derive)."
    logger.info(closing)
|
|
|
|
|
|
# Script entry point: run the repair only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|