Fix Japanese/CJK text mangled in Soulseek search queries

normalize_string() was running unidecode on all text, which transliterated Japanese kanji into Chinese pinyin gibberish (命の灯火 → "tvanimedei"). It now detects CJK characters (kanji, hiragana, katakana, hangul, halfwidth/fullwidth forms) and skips unidecode for any text containing them, lowercasing it instead. Text with no CJK characters (Latin accents, Cyrillic) still goes through unidecode and lowercasing as before.
2 months ago · d944d4a7d2
parent 1646c3d9e1
commit d944d4a7d2
3 changed files with 20 additions and 2 deletions
--- a/core/matching_engine.py
+++ b/core/matching_engine.py
@@ -67,8 +67,16 @@ class MusicMatchingEngine:
        # Apply the character replacements before other normalization steps
        for original, replacement in char_map.items():
            text = text.replace(original, replacement)
-        text = unidecode(text)
-        text = text.lower()
+
+        # Skip unidecode for CJK text — it converts Japanese kanji to Chinese pinyin,
+        # producing gibberish like "tvanimedei" for "命の灯火". Preserve original characters
+        # so Soulseek searches use the real title. Only apply unidecode to non-CJK text.
+        if any('\u2e80' <= c <= '\u9fff' or '\u3040' <= c <= '\u30ff' or '\uff00' <= c <= '\uffef' or '\uac00' <= c <= '\ud7af' for c in text):
+            # CJK detected — just lowercase, don't transliterate
+            text = text.lower()
+        else:
+            text = unidecode(text)
+            text = text.lower()
        
        # Expand specific abbreviations for better matching
        abbreviation_map = {
--- a/web_server.py
+++ b/web_server.py
@@ -19246,6 +19246,15 @@ def get_version_info():
        "title": "What's New in SoulSync",
        "subtitle": f"Version {SOULSYNC_VERSION} — Latest Changes",
        "sections": [
+            {
+                "title": "🔧 Fix Japanese Song Searches Producing Gibberish",
+                "description": "CJK text no longer mangled by unidecode in Soulseek search queries",
+                "features": [
+                    "• Japanese kanji, hiragana, katakana, and Korean hangul preserved in search queries",
+                    "• unidecode was converting Japanese to Chinese pinyin (e.g. 命の灯火 → 'tvanimedei')",
+                    "• Soulseek users typically share files with original CJK characters in filenames"
+                ]
+            },
            {
                "title": "🔧 Fix Partial Name Matching False Positives (#225)",
                "description": "Track ownership check no longer falsely matches prefix/suffix variations",
--- a/webui/static/helper.js
+++ b/webui/static/helper.js
@@ -3403,6 +3403,7 @@ function closeHelperSearch() {
 const WHATS_NEW = {
    '2.1': [
        // Newest features first
+        { title: 'Fix Japanese/CJK Soulseek Searches',       desc: 'Japanese kanji no longer mangled into Chinese pinyin — searches now use original characters' },
        { title: 'Fix Partial Title Matching',               desc: '"Believe" no longer falsely matches "Believe In Me" — length ratio penalty prevents prefix false positives' },
        { title: 'Fix Pipeline Blocking on Discovery Fail',  desc: 'Playlist sync no longer drops tracks that failed metadata discovery — continues with original name/artist for download' },
        { title: 'Playlist Explorer',                         desc: 'New page: expand playlists into visual discovery trees of albums and discographies — select and add to wishlist', page: 'playlist-explorer' },