mirror of https://github.com/Nezreka/SoulSync.git
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
82 lines
3.8 KiB
82 lines
3.8 KiB
"""Guard against char-level title false positives in track matching.
|
|
|
|
Issue #769: playlist sync matched tracks that aren't in the library to a
DIFFERENT song by the SAME artist, with high confidence — e.g. "Dani
California" -> "Californication" (Red Hot Chili Peppers), "Under The Bridge"
-> "Around the World". The confidence formula is ``0.5*title + 0.5*artist``,
and a same-artist comparison always yields ``artist = 1.0``, so the title score
is the only thing that can tell two of an artist's songs apart. But the title
score is a ``difflib.SequenceMatcher`` character ratio, which over-credits
unrelated titles that happen to share a long substring ("californi…") or only a
stopword ("the"): 0.67 and 0.62 respectively. With the flat 0.5 artist term
that lands at 0.83 / 0.81 — well over the 0.7 sync threshold.
|
|
|
|
``titles_plausibly_same`` adds a cheap word-level sanity check on top of the
char ratio: accept a pair only when it's near-identical char-wise (so typos and
punctuation/casing variants — "Beleive"/"Believe", "HUMBLE."/"Humble" — still
match) OR the two titles share at least one significant (non-stopword) token.
The word check is skipped (and the pair accepted) when either title has no
content word or neither has two or more, so single-word spelling variants are
left to the caller's char-similarity floor. Two genuinely different songs by
the same artist typically share no content word, so they get rejected; the
real track is then correctly reported missing.
|
|
"""
|
|
|
|
from __future__ import annotations

import re
import unicodedata
|
|
|
|
# Function words only: articles, conjunctions and prepositions. Pronouns
# ("you", "me", "i") are intentionally NOT listed — in song titles they carry
# meaning, and filtering them could remove the only word a genuine match has in
# common. "the" has to be listed: otherwise "Under The Bridge" and
# "Around the World" would count as sharing a content word.
_TITLE_STOPWORDS = frozenset((
    # articles
    "a", "an", "the",
    # conjunctions
    "and", "or",
    # prepositions
    "at", "by", "for", "from", "in", "of", "on", "to", "with",
))
|
|
|
|
# A token is a maximal run of ASCII lowercase letters and digits; every other
# character (spaces, punctuation, uppercase, non-ASCII) acts as a separator.
# ``_content_tokens`` lowercases the text before applying this pattern.
_TOKEN_RE = re.compile(r"[a-z0-9]+")
|
|
|
|
# Char ratio at/above which two titles are treated as the same regardless of
# shared words — covers typos, punctuation, casing, accents. Tuned so single-
# word typos ("Beleive"/"Believe" = 0.857) pass while the #769 false positives
# ("Dani California"/"Californication" = 0.667) do not.
|
|
_NEAR_IDENTICAL = 0.85  # default for titles_plausibly_same(near_identical=...)
|
|
|
|
|
|
def _content_tokens(text: str) -> set[str]:
    """Return the set of significant (non-stopword) word tokens in *text*.

    The text is accent-folded and lowercased before tokenizing. Without the
    folding, the ASCII-only ``_TOKEN_RE`` treats every accented letter as a
    separator: "días" would become the bogus tokens "d" and "as", which never
    equal the "dias" token of the unaccented spelling and make a one-word
    title look like a multi-word one.

    Letters with no Unicode decomposition (e.g. "ø", "ß") and non-Latin
    scripts are not folded and still act as separators.

    Args:
        text: A track title. ``None`` or an empty string yields an empty set.

    Returns:
        The tokens matched by ``_TOKEN_RE`` that are not in
        ``_TITLE_STOPWORDS``.
    """
    # NFKD splits each accented letter into its base letter followed by
    # combining marks; dropping the marks leaves the plain base letter.
    decomposed = unicodedata.normalize("NFKD", text or "")
    folded = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    return {
        token
        for token in _TOKEN_RE.findall(folded.lower())
        if token not in _TITLE_STOPWORDS
    }
|
|
|
|
|
|
def titles_plausibly_same(
    title_a: str,
    title_b: str,
    char_similarity: float,
    *,
    near_identical: float = _NEAR_IDENTICAL,
) -> bool:
    """Decide whether two titles may refer to the same track.

    This is a word-level sanity check layered on the caller's character-ratio
    score, meant to reject different songs that the ratio over-credited
    (#769).

    Args:
        title_a: First title. Should already be normalised/cleaned
            (lowercased, brackets stripped) the same way as the text
            ``char_similarity`` was computed from.
        title_b: Second title, cleaned identically.
        char_similarity: Character-level similarity the caller already
            computed for this pair.
        near_identical: Keyword-only. Similarity at or above which the pair
            is accepted without looking at its words.

    Returns:
        ``True`` when the pair is near-identical char-wise, when the word
        check does not apply (either title has no content token, or neither
        has two or more), or when the titles share at least one content
        token. ``False`` when the titles are only moderately char-similar,
        at least one has two or more content tokens, and they have none in
        common.
    """
    # Typos and punctuation/casing variants: trust the char ratio outright.
    if char_similarity >= near_identical:
        return True

    words_a = _content_tokens(title_a)
    words_b = _content_tokens(title_b)

    # One side has no content word at all, so there is nothing to compare;
    # leave the decision to the caller's own char-similarity floor.
    if not words_a or not words_b:
        return True

    # Shared-word overlap only signals "different song" when at least one
    # title has 2+ content words — the #769 shape, where the char ratio
    # over-credits a common substring ("Dani California"/"Californication")
    # or a stopword ("Under The Bridge"/"Around the World"). Two single-word
    # titles have no other word to share, and rejecting them would wrongly
    # fail stylized spellings ("Grey"/"Gray", "Tonite"/"Tonight",
    # "Thru"/"Through"), so those also defer to the caller.
    if len(words_a) < 2 and len(words_b) < 2:
        return True

    return bool(words_a & words_b)
|
|
|
|
|
|
# Public API: only the predicate is exported; the underscore-prefixed helper
# and constants in this module are private.
__all__ = ["titles_plausibly_same"]
|