From b54aaa5f76ca1849c216e76bcf55a89649970574 Mon Sep 17 00:00:00 2001 From: Rene Cannao Date: Sat, 23 May 2026 17:57:45 +0000 Subject: [PATCH] fix(set-walker): preserve raw source for function calls and delimited idents The ParserSQL-backed SET walker (algorithm 3) was round-tripping RHS values through the Emitter, which normalises whitespace (adds ", " between function call args) and drops delimiter quotes on identifiers (NODE_IDENTIFIER emits its content without re-quoting). Both behaviours diverged from the regex-based parsers used in algorithms 0-2. The function-call drift cascaded across set_testing-t because the wrong `sql_mode = concat(@@sql_mode, ',X')` value (with extra space) persisted on the connection and tripped every subsequent variable comparison until reset. The delimited-ident drift caused pgsql-set_parameter_validation_test-t to treat `"MixedCase"` as `mixedcase` and `"$user"` as the current-user substitution token (rather than the literal identifier the user wrote). Walker fixes: - NODE_FUNCTION_CALL -> paren-match in original query buffer from value_ptr, return the verbatim substring. - NODE_IDENTIFIER/NODE_COLUMN_REF with FLAG_IDENT_DELIMITED -> splice the surrounding quote chars back in from value_ptr-1 / value_ptr+value_len. Both helpers are no-ops when value_ptr is outside the query buffer (defensive fallback to the existing emit_node_text path). Tests: adds two strict (byte-exact) regression suites in setparser_parsersql_test.cpp -- the existing normalize_value() collapses whitespace and quote-style differences, which is what hid the bugs in the first place. The new TestStrictFunctionCall and TestStrictPgsqlIdent helpers compare verbatim so any future re-introduction is caught. Local: 288/288 unit tests pass. --- lib/Query_Processor_ParserSQL.cpp | 70 ++++++++++++++++ test/tap/tests/setparser_parsersql_test.cpp | 93 +++++++++++++++++++++ 2 files changed, 163 insertions(+) diff --git a/lib/Query_Processor_ParserSQL.cpp b/lib/Query_Processor_ParserSQL.cpp index 098d8d5aa..3c65a2bcd 100644 --- a/lib/Query_Processor_ParserSQL.cpp +++ b/lib/Query_Processor_ParserSQL.cpp @@ -170,6 +170,55 @@ static std::string extract_paren_expr(const char* query, int query_len, return std::string(start, p); } +// Extract the verbatim source text of a function-call AST node by paren-matching +// from the function name in the original input. Used to avoid emit_function_call's +// "name(arg, arg)" normalisation (which adds a space after every comma) so that the +// SET walker preserves the exact source the user wrote, matching the regex-based +// SET parsers (algorithms 0-2) byte-for-byte. Returns empty string if value_ptr is +// not inside [query, query+query_len) or no balanced paren is found. +static std::string extract_function_call_source( + const AstNode* node, const char* query, int query_len) +{ + if (!node || !node->value_ptr || node->value_len == 0) return ""; + const char* qstart = query; + const char* qend = query + query_len; + if (node->value_ptr < qstart || node->value_ptr >= qend) return ""; + const char* start = node->value_ptr; + const char* p = start + node->value_len; + while (p < qend && (*p == ' ' || *p == '\t' || *p == '\n' || *p == '\r')) p++; + if (p >= qend || *p != '(') return ""; + int depth = 0; + while (p < qend) { + if (*p == '\'' || *p == '"' || *p == '`') { skip_quoted_char(p, qend); } + else if (*p == '(') depth++; + else if (*p == ')') { depth--; if (depth == 0) { p++; break; } } + p++; + } + if (depth != 0) return ""; + return std::string(start, p); +} + +// Re-emit a delimited identifier ("name" in PG, `name` in MySQL) with its outer +// quote chars restored. The AST stores value_ptr pointing inside the quotes and +// value_len covering only the identifier content, so the surrounding quote chars +// live at value_ptr-1 and value_ptr+value_len in the original query buffer. +// Returns empty string if the node is not in-buffer or the surrounding chars +// don't look like recognised quote chars. +static std::string emit_delimited_ident_raw( + const AstNode* node, const char* query, int query_len) +{ + if (!node || !node->value_ptr || node->value_len == 0) return ""; + const char* qstart = query; + const char* qend = query + query_len; + if (node->value_ptr <= qstart) return ""; + if (node->value_ptr + node->value_len >= qend) return ""; + char open_q = *(node->value_ptr - 1); + char close_q = *(node->value_ptr + node->value_len); + if (open_q != '"' && open_q != '`') return ""; + if (open_q != close_q) return ""; + return std::string(node->value_ptr - 1, node->value_ptr + node->value_len + 1); +} + // --------------------------------------------------------------------------- // Section 1: Digest adapter // --------------------------------------------------------------------------- @@ -379,6 +428,27 @@ static std::string resolve_var_value( } return ""; } + // Function calls round-trip lossily through emit_function_call (it injects + // ", " between arguments regardless of the input). Reach back into the + // original query and copy the source verbatim instead. Matches the + // behaviour of the regex-based SET parsers used in algorithms 0-2. + if (rhs->type == NodeType::NODE_FUNCTION_CALL) { + std::string raw = extract_function_call_source(rhs, query, query_len); + if (!raw.empty()) return raw; + } + // Delimited identifiers (`"$user"`, `"MixedCase"`, `"sch-1"`) carry + // FLAG_IDENT_DELIMITED but value_ptr/value_len cover only the content + // between the quotes -- the emitter would re-emit the bare identifier, + // losing the delimiters that downstream validators need (e.g. the PG + // search_path validator distinguishes literal "$user" from the $user + // current-user substitution token, "MixedCase" from case-folded + // mixedcase, etc.). Splice the quotes back in from the original buffer. + if ((rhs->type == NodeType::NODE_IDENTIFIER || + rhs->type == NodeType::NODE_COLUMN_REF) && + (rhs->flags & FLAG_IDENT_DELIMITED)) { + std::string raw = emit_delimited_ident_raw(rhs, query, query_len); + if (!raw.empty()) return raw; + } return emit_node_text(rhs, arena); } diff --git a/test/tap/tests/setparser_parsersql_test.cpp b/test/tap/tests/setparser_parsersql_test.cpp index 3fa8d4e63..20f6ffa80 100644 --- a/test/tap/tests/setparser_parsersql_test.cpp +++ b/test/tap/tests/setparser_parsersql_test.cpp @@ -27,6 +27,47 @@ static Test parsersql_syntax_errors[] = { { Expected("sql_mode", { "SELCT" } ) } }, }; +// Byte-exact regression tests for the walker's function-call source preservation +// (pairs of input -> expected verbatim value). The shared `TestParse` strips +// whitespace and quote-style differences via normalize_value(), which hid an +// earlier round-trip bug where emit_function_call injected ", " between +// arguments: `concat(@@sql_mode,'X')` → `concat(@@sql_mode, 'X')`. The version +// drift carried via session tracking and broke set_testing-t. These cases must +// compare byte-for-byte to catch any future regression in the same area. +struct StrictCase { + const char* query; + const char* var; + const char* expected; +}; +static StrictCase parsersql_function_call_strict[] = { + { "SET sql_mode = concat(@@sql_mode,',STRICT_TRANS_TABLES')", + "sql_mode", "concat(@@sql_mode,',STRICT_TRANS_TABLES')" }, + { "SET sql_mode = CONCAT(@@sql_mode, ',STRICT_TRANS_TABLES')", + "sql_mode", "CONCAT(@@sql_mode, ',STRICT_TRANS_TABLES')" }, + { "SET sql_mode = concat( @@sql_mode , 'X' )", + "sql_mode", "concat( @@sql_mode , 'X' )" }, +}; + +// Byte-exact regression tests for PG delimited identifier preservation in SET RHS. +// The AST stores value_ptr inside the quotes and value_len covering only the +// identifier content, so the walker has to splice the quote chars back in when +// FLAG_IDENT_DELIMITED is set. Drives the search_path-specific failures in +// pgsql-set_parameter_validation_test-t where `"MixedCase"` was being lowercased +// to `mixedcase` and `"$user"` was being mistaken for the current-user substitution. +struct StrictPgsqlCase { + const char* query; + const char* var; + std::vector expected_values; +}; +static StrictPgsqlCase parsersql_pgsql_ident_strict[] = { + { "SET search_path = \"MixedCase\"", "search_path", { "\"MixedCase\"" } }, + { "SET search_path = \"MixedCase\", public", "search_path", { "\"MixedCase\"", "public" } }, + { "SET search_path = \"$user\"", "search_path", { "\"$user\"" } }, + { "SET search_path TO \"$user\", public", "search_path", { "\"$user\"", "public" } }, + { "SET search_path = \"sch-1\", \"sch 2\"", "search_path", { "\"sch-1\"", "\"sch 2\"" } }, + { "SET search_path = pg_catalog, \"$user\"", "search_path", { "pg_catalog", "\"$user\"" } }, +}; + // ---------------------------------------------------------------------------- // MySQL queries from test_filtered_set_statements-t (variables that ProxySQL // is supposed to filter out — should still parse cleanly via ParserSQL). @@ -242,6 +283,54 @@ void TestParsePgsql(const Test* tests, int ntests, const std::string& title) { } +void TestStrictFunctionCall(const StrictCase* cases, int n) { + for (int i = 0; i < n; i++) { + auto result = parsersql_parse_set_mysql(cases[i].query); + auto it = result.find(cases[i].var); + bool found = (it != result.end() && it->second.size() == 1); + ok(found, "[strict_function_call %d] var '%s' present with single value for query: %s", + i, cases[i].var, cases[i].query); + if (found) { + bool eq = (it->second[0] == cases[i].expected); + ok(eq, "[strict_function_call %d] byte-exact match for: %s", i, cases[i].query); + if (!eq) { + diag(" expected: [%s]", cases[i].expected); + diag(" got : [%s]", it->second[0].c_str()); + } + } else { + ok(false, "[strict_function_call %d] cannot byte-compare (var missing or multi-value)", i); + } + } +} + +void TestStrictPgsqlIdent(const StrictPgsqlCase* cases, int n) { + for (int i = 0; i < n; i++) { + auto result = parsersql_parse_set_pgsql(cases[i].query); + auto it = result.find(cases[i].var); + bool found = (it != result.end() + && it->second.size() == cases[i].expected_values.size()); + ok(found, "[strict_pgsql_ident %d] var '%s' present with %zu value(s) for query: %s", + i, cases[i].var, cases[i].expected_values.size(), cases[i].query); + if (found) { + bool eq = true; + for (size_t j = 0; j < cases[i].expected_values.size(); ++j) { + if (it->second[j] != cases[i].expected_values[j]) { eq = false; break; } + } + ok(eq, "[strict_pgsql_ident %d] byte-exact match for: %s", i, cases[i].query); + if (!eq) { + for (size_t j = 0; j < cases[i].expected_values.size(); ++j) { + diag(" expected[%zu]: [%s]", j, cases[i].expected_values[j].c_str()); + } + for (size_t j = 0; j < it->second.size(); ++j) { + diag(" got[%zu] : [%s]", j, it->second[j].c_str()); + } + } + } else { + ok(false, "[strict_pgsql_ident %d] cannot byte-compare (var missing or count mismatch)", i); + } + } +} + int main(int argc, char** argv) { unsigned int p = 0; p += arraysize(sql_mode); @@ -258,6 +347,8 @@ int main(int argc, char** argv) { p += arraysize(parsersql_pgsql_search_path); p += arraysize(parsersql_pgsql_time_zone); p *= 2; + p += arraysize(parsersql_function_call_strict) * 2; + p += arraysize(parsersql_pgsql_ident_strict) * 2; plan(p); TestParse(sql_mode, arraysize(sql_mode), "sql_mode"); TestParse(time_zone, arraysize(time_zone), "time_zone"); @@ -272,6 +363,8 @@ int main(int argc, char** argv) { TestParse(parsersql_mysql_set_testing, arraysize(parsersql_mysql_set_testing), "mysql_set_testing"); TestParsePgsql(parsersql_pgsql_search_path, arraysize(parsersql_pgsql_search_path), "pgsql_search_path"); TestParsePgsql(parsersql_pgsql_time_zone, arraysize(parsersql_pgsql_time_zone), "pgsql_time_zone"); + TestStrictFunctionCall(parsersql_function_call_strict, arraysize(parsersql_function_call_strict)); + TestStrictPgsqlIdent(parsersql_pgsql_ident_strict, arraysize(parsersql_pgsql_ident_strict)); return exit_status(); }