From d944d4a7d240f759ecb80694a9d01efd7c26bc40 Mon Sep 17 00:00:00 2001 From: Broque Thomas <26755000+Nezreka@users.noreply.github.com> Date: Mon, 30 Mar 2026 18:28:25 -0700 Subject: [PATCH] Fix Japanese/CJK text mangled in Soulseek search queries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit normalize_string() was running unidecode on all text, converting Japanese kanji to Chinese pinyin gibberish (命の灯火 → "tvanimedei"). Now detects CJK characters (kanji, hiragana, katakana, hangul, fullwidth forms) and skips unidecode for text containing them — just lowercases instead. Non-CJK text (Latin accents, Cyrillic) still goes through unidecode normally. --- core/matching_engine.py | 12 ++++++++++-- web_server.py | 9 +++++++++ webui/static/helper.js | 1 + 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/core/matching_engine.py b/core/matching_engine.py index 83ee7cab..e5d66d51 100644 --- a/core/matching_engine.py +++ b/core/matching_engine.py @@ -67,8 +67,16 @@ class MusicMatchingEngine: # Apply the character replacements before other normalization steps for original, replacement in char_map.items(): text = text.replace(original, replacement) - text = unidecode(text) - text = text.lower() + + # Skip unidecode for CJK text — it converts Japanese kanji to Chinese pinyin, + # producing gibberish like "tvanimedei" for "命の灯火". Preserve original characters + # so Soulseek searches use the real title. Only apply unidecode to non-CJK text. + if any('\u2e80' <= c <= '\u9fff' or '\u3040' <= c <= '\u30ff' or '\uff00' <= c <= '\uffef' or '\uac00' <= c <= '\ud7af' for c in text): + # CJK detected — just lowercase, don't transliterate + text = text.lower() + else: + text = unidecode(text) + text = text.lower() # Expand specific abbreviations for better matching abbreviation_map = { diff --git a/web_server.py b/web_server.py index 314c924b..e0f094fb 100644 --- a/web_server.py +++ b/web_server.py @@ -19246,6 +19246,15 @@ def get_version_info(): "title": "What's New in SoulSync", "subtitle": f"Version {SOULSYNC_VERSION} — Latest Changes", "sections": [ + { + "title": "🔧 Fix Japanese Song Searches Producing Gibberish", + "description": "CJK text no longer mangled by unidecode in Soulseek search queries", + "features": [ + "• Japanese kanji, hiragana, katakana, and Korean hangul preserved in search queries", + "• unidecode was converting Japanese to Chinese pinyin (e.g. 命の灯火 → 'tvanimedei')", + "• Soulseek users typically share files with original CJK characters in filenames" + ] + }, { "title": "🔧 Fix Partial Name Matching False Positives (#225)", "description": "Track ownership check no longer falsely matches prefix/suffix variations", diff --git a/webui/static/helper.js b/webui/static/helper.js index 5fa5efa4..ca3f0aa3 100644 --- a/webui/static/helper.js +++ b/webui/static/helper.js @@ -3403,6 +3403,7 @@ function closeHelperSearch() { const WHATS_NEW = { '2.1': [ // Newest features first + { title: 'Fix Japanese/CJK Soulseek Searches', desc: 'Japanese kanji no longer mangled into Chinese pinyin — searches now use original characters' }, { title: 'Fix Partial Title Matching', desc: '"Believe" no longer falsely matches "Believe In Me" — length ratio penalty prevents prefix false positives' }, { title: 'Fix Pipeline Blocking on Discovery Fail', desc: 'Playlist sync no longer drops tracks that failed metadata discovery — continues with original name/artist for download' }, { title: 'Playlist Explorer', desc: 'New page: expand playlists into visual discovery trees of albums and discographies — select and add to wishlist', page: 'playlist-explorer' },