From e267986d44d9005dbfe16a2f8d4d27390ee5ae9d Mon Sep 17 00:00:00 2001 From: Broque Thomas Date: Fri, 8 Aug 2025 13:57:33 -0700 Subject: [PATCH] better matching engine --- .../matching_engine.cpython-312.pyc | Bin 29435 -> 29623 bytes core/matching_engine.py | 38 ++++++++++-------- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/core/__pycache__/matching_engine.cpython-312.pyc b/core/__pycache__/matching_engine.cpython-312.pyc index d90c8cc655d22674487eae901240855b27ba58b1..51e780d318ee68ebeb1d9bf3b62d27a822c5ab09 100644 GIT binary patch delta 2613 zcmZ`*3rt(*6~5;hUohBUFyUb^U>h5pK!^#?mIRzYnluy%LxAuIFB@a?Fu6AA>Rv|L zscP45YOB*U>!vALH!1DtNYb0FNgtcHl8k9>^AnM+MUj?mtyXPmO9EOqwafPZHl)!u zu5`b1{&Uay?s@$FufO;RUi%9q{#K)j=jgT2`~2|LZ=Fq?6+nS+35EEjAVQq%kFDil zn|xX|2e4lbCfopMk^42D19Zy=lU`MTTUP6CD&U*)?X*!Ieu5*Jj|-pj9Cw_9M+gXbZu``r~ zT~o=D))mJbX@F`yKuUJSuj^A9SHe_^q>)l60XN74y{8lpayB(y%l4~0QJ#-$<^6^< zUbsM)0U15HpP01S-w9qxGG8|sPj;WFU$9gy7^k{t+bg1>H%D2Fqa!LLdkZua5 zMl;|zU;4Vnaae4%qrEx{AF_Hj^!c;3WYbsCSzkrVx`GP;2Kn8>GT=it)NJU4Qo7s&lXNv9ddKBkTP8sG z7bOooA`cf=g3WZI>4hdz+(z2nq-}@{jgj$($(XcErvc486-7>dmE#O8IJdQDdd<6& zba_dqkN6G~ahm#w;9#x}{y-$6EfUerzi-}pMoAldCA<8?jGPnd7u2M{5#CB1&7`@5 z92_DY9^wg-$hj0(!Mi{H(!uvI-HoP-7@8hBePlLz=-Ci?h6CR%wH6E5D( zgms9$JJTbdE3trWyKaA+a49y0p+1J<;Mz`$beBz7LGP6&mCW1ikkBE;N--mqw4N5+ zS!#hgJXvMHM@ow}X(Is?e7bdwj&-)QK(%RojYqRn@m6VxB1cNVZPlq*wS5a6x(>tT z1(3ydiu%c159f-15vDXj1I}%?8NV;7#qR}k*C{0R3`BL8I`Y%`HYS!?;24&bRe}-6 ztMc)gvQ5w(N$c&hjnY~!GiPNN!%5 zFES%+tS$x%p0(%{%mP*uzFPecK4g;5-Gcy6;NF@dK4izQ*Vy@x5iiuBo}Dt{>kQst z(9GaX25;fe4wL@pOzmLs3kG2Zzhv+$oZeZlnPX}Jg_jlLMO@iofnVeGoegjbdw1r- zJo7Q!@e$vnjsdo3^YV_0?2Hhj-Y=oGHdmOZhL8zcYEA5%n67Q&Lk9ePZ4T)0@3mI# zIp$x^-~@vgv7pWbKfwFz3X^}t473s4QcQm^D2g-qjXJY%npscbD|M?0mza8)LcoYn ztjDk;4&QY&3vG161|GxwU4P6!M!YdB**d49)y2|3tF3|))bZp_}je2@Nk z~JD!njnV5i&|0MS-%6S z)(((;^Vazc((EKHy~No^`ud4w;A+O;2gzA?qW8`l=95T!h!`GT)Jpi{h9ScmZ_W9( z^LxV%vZs?Ybdj!ZV(3}a9zxrm2~&Z}nIT-VTPeO{S2+#BJNH^CUd~cEO~PeMy3>Sj z?D-_+2JL}OA?jH_QW*RWU%#&nF3QP!<6~5w8_nl)9{bzbTFh&F3Ohk2vQj=pxT+S?cY>gN(>D8MYpS0bO$C!g8}igbtn47 zF9z>=qQF+CpXYhJ=sXFl@=y!?Mula^fky#WaIP(Q*WPPoB<&0m>BPh&bheve6_2z( z4SHGVILaHt>|rAXn;=H=N@I?z%-X{s6;E{ckiHoe8}r^j}N-V)F|hN zoe0hH*Sd;hgoq`KGrd?GX3>mT%wTQ-s@YY3)eo*3; z{$BVEKGi=uoy8&}j?oadTwuX*O#30zx+pBE+~bpDZm~b;j%04r+%+iHTv5gR?{qYp zMJ}_&Gf*%{ps)nqC1Gs<8hX+n85jYt{HK9`DnX0ahI46uzZ}koD`@oAfK@)^tphkF zpYUDgQ|_hVw%F)|OA-(?FfN-TOkI z>Yn89+ODSY(_nBF!T^&EZnbo`)i-Jz4FR|F97jIbzuMA#-3xtA7= zBvBiP1nEk(1pg|;Kr6RvYk62EKdXBSph@mZo&oU7O~!u%bjtN9r&JJ>|CM%A1y9PK zW(@Lh0^6(y;fDNy^%=066Uhm~z#@(l4M9~xA5@QMIBtY?FltN~gBtu%&do+)oM$~z zpE;2l)UpT~nKU1nFb8$>>rrD+Kbn>>CenKNR<4@|`Wsd4;XKKBB{wRswq@|b1Z9%O zvD^N6Zg$?W+}G4o&eADs*^^DzEjD8J5{s7@yt4vMZOFp!ISwvZ4Vrzx?>v7ZZma*iaNU#7n>7c%`UVAGcG-?9r!w>A*i1 z-2ww%U0h+%42j`LY+54?4Tj~VONKzWL!+cV`FODp7CK2m8`;!N+JYoFKnC}bfst8` z`wleo)VMbCw>Zw$g6d^mV>Ggzbo7$;KGGK@L!%`6C=D5dRwc4ciQL3rcP)KQLtJH7 zbIWdI*@&Zd(nM;TFYX~*f@E`ughV3k{gd?Ql*E$j_?7&Z92bLn#vwriGY!0-nO44w zneBXl89%>Z7qh$gZf5L3R?Zrdbq_3rRF9>pNH|}eM)5+bx~T|jyqWyFN%9u27m_z5 zvJwCg~=8wi1PF=z<|ku+e(&d^oivDOoZ}E zjbUz;WWkS0+_+|4j^U8Kmka7&fKg*C2a8KRT1*(kBQYCxm0IC(rOZ>MCOCjUE%kvN zAt@cd_7%VyT6Qfhxx37zH787BO3aMoWzPRA|5;g@rZbVIl&_{^0ryJ_ha7VU)ODh~ z0&?(^@_oPB08je9-IxvQ zDwrMzDp+$_N^@^kG{R2QR66fnA_dn~8t>}&NF|FuU0DcD{IJpvRyx1gXss#+7gJG{ z*Kfr@Ne(*PS{$pot&bPd7U*=o>X7$6hyYKZv3iLer$qknC)e{EZbqom5wEQWi-k-(s1I35!B zRXGYhX^3_1FQt9DYD*gH9V(RML~Y-uMM9oJH~;<<*|B^mp!rH)`=RgaZYRZWynEShwZ zhuTR)2kF>KGCHrNZo?a^hVpH?mR#XNp_}3*i>}2fT(Y?-UiRo(+`{FO%oaC3vikG1 zztc;4RAM%}bsB>|VtmbJI4fUTqgQK0iD~fR`tQPwJn*mz;16s*`Sn>h}mnlpOQdEqEf*##BsQ<$bOj37L3lf;?%JbI#}+&kgW z(i-Kz27IFRnEECS!&+~KT+?baCFj93xozs$>v92Ny6*o)5v)&Y;71$Mv% zY}z$Amc=raKqWgr|kOufa&wE$beDko<=XjdOaF3k zqQHRhi)yx7f$v8>8jD6hTZFoy?=LFUxF-y=95uwhW6lP{q(4fAW28Se6^wt$&Sw4c kg`u5%(zGxb?d=VTxL7KHM%*a1r;O>r^G}YUXrl%G2g{(;2><{9 diff --git a/core/matching_engine.py b/core/matching_engine.py index 31fd2c98..40d267ce 100644 --- a/core/matching_engine.py +++ b/core/matching_engine.py @@ -36,12 +36,13 @@ class MusicMatchingEngine: ] self.artist_patterns = [ + # Only remove featured artists, not parts of main artist names r'\s*feat\..*', r'\s*ft\..*', r'\s*featuring.*', - r'\s*&.*', - r'\s*and.*', - r',.*' + # REMOVED: r'\s*&.*' - This breaks "Daryl Hall & John Oates", "Blood & Water" + # REMOVED: r'\s*and.*' - This breaks artist names with "and" + # REMOVED: r',.*' - This can break legitimate artist names with commas ] def normalize_string(self, text: str) -> str: @@ -204,10 +205,13 @@ class MusicMatchingEngine: plex_core_title = self.get_core_string(plex_track.title) if spotify_core_title and spotify_core_title == plex_core_title: - # If the core titles are identical, we are highly confident. - # The final score is a high base (0.9) plus a bonus for artist similarity. - confidence = 0.90 + (artist_score * 0.09) # Max score of 0.99 - return confidence, "core_title_match" + # SAFETY CHECK: Only give high confidence if artist also matches reasonably well + # This prevents "Artist A - Girls" from matching "Artist Z - Girls" with high confidence + if artist_score >= 0.75: # Require decent artist match + # If the core titles are identical and artists match, we are highly confident + confidence = 0.90 + (artist_score * 0.09) # Max score of 0.99 + return confidence, "core_title_match" + # If artist score is too low, fall through to standard weighted calculation # --- Priority 2: Fuzzy Title Match (for variations, typos, etc.) --- spotify_title_cleaned = self.clean_title(spotify_track.name) @@ -294,17 +298,17 @@ class MusicMatchingEngine: # SAFETY CHECK: Don't return empty or too-short titles if not cleaned_title or len(cleaned_title.strip()) < 2: - print(f"⚠️ Album removal would create empty title: '{original_title}' → '{cleaned_title}' - keeping original") + logger.warning(f"Album removal would create empty title: '{original_title}' → '{cleaned_title}' - keeping original") return track_title, False # SAFETY CHECK: Don't remove if it would leave only articles or very short words words = cleaned_title.split() meaningful_words = [w for w in words if len(w) > 2 and w.lower() not in ['the', 'and', 'or', 'of', 'a', 'an']] if not meaningful_words: - print(f"⚠️ Album removal would leave only short words: '{original_title}' → '{cleaned_title}' - keeping original") + logger.warning(f"Album removal would leave only short words: '{original_title}' → '{cleaned_title}' - keeping original") return track_title, False - print(f"🎵 Detected album in title: '{original_title}' → '{cleaned_title}' (removed: '{match.group(1)}', similarity: {similarity:.2f})") + logger.debug(f"Detected album in title: '{original_title}' → '{cleaned_title}' (removed: '{match.group(1)}', similarity: {similarity:.2f})") return cleaned_title, True # Fallback: detect common album-like suffixes even without album context @@ -363,7 +367,7 @@ class MusicMatchingEngine: cleaned_track = self.clean_title(cleaned_title) if cleaned_track: queries.append(f"{artist} {cleaned_track}".strip()) - print(f"🎯 PRIORITY 1: Album-cleaned query: '{artist} {cleaned_track}'") + logger.debug(f"PRIORITY 1: Album-cleaned query: '{artist} {cleaned_track}'") # PRIORITY 2: Try simplified versions, but preserve important version info # Only remove content that's likely to be album names or noise, not version info @@ -392,9 +396,9 @@ class MusicMatchingEngine: dash_clean = self.clean_title(title_part) if dash_clean and dash_clean not in [self.clean_title(q.split(' ', 1)[1]) for q in queries if ' ' in q]: queries.append(f"{artist} {dash_clean}".strip()) - print(f"🎯 PRIORITY 2: Dash-cleaned query (removed album): '{artist} {dash_clean}'") + logger.debug(f"PRIORITY 2: Dash-cleaned query (removed album): '{artist} {dash_clean}'") elif should_preserve: - print(f"🎯 PRESERVED: Keeping dash content '{dash_content}' as it appears to be version info") + logger.debug(f"PRESERVED: Keeping dash content '{dash_content}' as it appears to be version info") # Pattern 2: Only remove parentheses that contain noise (feat, explicit, etc), not version info # Check if parentheses contain version-related keywords before removing @@ -425,16 +429,16 @@ class MusicMatchingEngine: simple_clean = self.clean_title(simple_title) if simple_clean and simple_clean not in [self.clean_title(q.split(' ', 1)[1]) for q in queries if ' ' in q]: queries.append(f"{artist} {simple_clean}".strip()) - print(f"🎯 PRIORITY 2: Noise-removed query: '{artist} {simple_clean}'") + logger.debug(f"PRIORITY 2: Noise-removed query: '{artist} {simple_clean}'") elif is_version: - print(f"🎯 PRESERVED: Keeping parentheses content '({paren_content})' as it appears to be version info") + logger.debug(f"PRESERVED: Keeping parentheses content '({paren_content})' as it appears to be version info") # PRIORITY 3: Original query (ONLY if no album was detected or if it's different) original_track_clean = self.clean_title(original_title) if not album_detected or not queries: # Only add original if no album detected or no other queries if original_track_clean not in [q.split(' ', 1)[1] for q in queries if ' ' in q]: queries.append(f"{artist} {original_track_clean}".strip()) - print(f"🎯 PRIORITY 3: Original query: '{artist} {original_track_clean}'") + logger.debug(f"PRIORITY 3: Original query: '{artist} {original_track_clean}'") # Remove duplicates while preserving order unique_queries = [] @@ -492,7 +496,7 @@ class MusicMatchingEngine: quality_bonus = 0.0 if slskd_track.quality: if slskd_track.quality.lower() == 'flac': - quality_bonus = 0.1 + quality_bonus = 0.07 # Reduced from 0.1 to prevent low-confidence FLAC beating high-confidence MP3 elif slskd_track.quality.lower() == 'mp3' and (slskd_track.bitrate or 0) >= 320: quality_bonus = 0.05