fix(set-walker): preserve raw source for function calls and delimited idents

The ParserSQL-backed SET walker (algorithm 3) was round-tripping RHS values through the Emitter, which normalises whitespace (adds ", " between function call args) and drops delimiter quotes on identifiers (NODE_IDENTIFIER emits its content without re-quoting). Both behaviours diverged from the regex-based parsers used in algorithms 0-2. The function-call drift cascaded across set_testing-t because the wrong `sql_mode = concat(@@sql_mode, ',X')` value (with extra space) persisted on the connection and tripped every subsequent variable comparison until reset. The delimited-ident drift caused pgsql-set_parameter_validation_test-t to treat `"MixedCase"` as `mixedcase` and `"$user"` as the current-user substitution token (rather than the literal identifier the user wrote). Walker fixes: - NODE_FUNCTION_CALL -> paren-match in original query buffer from value_ptr, return the verbatim substring. - NODE_IDENTIFIER/NODE_COLUMN_REF with FLAG_IDENT_DELIMITED -> splice the surrounding quote chars back in from value_ptr-1 / value_ptr+value_len. Both helpers are no-ops when value_ptr is outside the query buffer (defensive fallback to the existing emit_node_text path). Tests: adds two strict (byte-exact) regression suites in setparser_parsersql_test.cpp -- the existing normalize_value() collapses whitespace and quote-style differences, which is what hid the bugs in the first place. The new TestStrictFunctionCall and TestStrictPgsqlIdent helpers compare verbatim so any future re-introduction is caught. Local: 288/288 unit tests pass.
1 month ago · b54aaa5f76
parent 5c9a548292
commit b54aaa5f76
2 changed files with 163 additions and 0 deletions
--- a/lib/Query_Processor_ParserSQL.cpp
+++ b/lib/Query_Processor_ParserSQL.cpp
@ -170,6 +170,55 @@ static std::string extract_paren_expr(const char* query, int query_len,
    return std::string(start, p);
 }

+// Extract the verbatim source text of a function-call AST node by paren-matching
+// from the function name in the original input. Used to avoid emit_function_call's
+// "name(arg, arg)" normalisation (which adds a space after every comma) so that the
+// SET walker preserves the exact source the user wrote, matching the regex-based
+// SET parsers (algorithms 0-2) byte-for-byte. Returns empty string if value_ptr is
+// not inside [query, query+query_len) or no balanced paren is found.
+static std::string extract_function_call_source(
+    const AstNode* node, const char* query, int query_len)
+{
+    if (!node || !node->value_ptr || node->value_len == 0) return "";
+    const char* qstart = query;
+    const char* qend = query + query_len;
+    if (node->value_ptr < qstart || node->value_ptr >= qend) return "";
+    const char* start = node->value_ptr;
+    const char* p = start + node->value_len;
+    while (p < qend && (*p == ' ' || *p == '\t' || *p == '\n' || *p == '\r')) p++;
+    if (p >= qend || *p != '(') return "";
+    int depth = 0;
+    while (p < qend) {
+        if (*p == '\'' || *p == '"' || *p == '`') { skip_quoted_char(p, qend); }
+        else if (*p == '(') depth++;
+        else if (*p == ')') { depth--; if (depth == 0) { p++; break; } }
+        p++;
+    }
+    if (depth != 0) return "";
+    return std::string(start, p);
+}
+
+// Re-emit a delimited identifier ("name" in PG, `name` in MySQL) with its outer
+// quote chars restored. The AST stores value_ptr pointing inside the quotes and
+// value_len covering only the identifier content, so the surrounding quote chars
+// live at value_ptr-1 and value_ptr+value_len in the original query buffer.
+// Returns empty string if the node is not in-buffer or the surrounding chars
+// don't look like recognised quote chars.
+static std::string emit_delimited_ident_raw(
+    const AstNode* node, const char* query, int query_len)
+{
+    if (!node || !node->value_ptr || node->value_len == 0) return "";
+    const char* qstart = query;
+    const char* qend = query + query_len;
+    if (node->value_ptr <= qstart) return "";
+    if (node->value_ptr + node->value_len >= qend) return "";
+    char open_q = *(node->value_ptr - 1);
+    char close_q = *(node->value_ptr + node->value_len);
+    if (open_q != '"' && open_q != '`') return "";
+    if (open_q != close_q) return "";
+    return std::string(node->value_ptr - 1, node->value_ptr + node->value_len + 1);
+}
+
 // ---------------------------------------------------------------------------
 // Section 1: Digest adapter
 // ---------------------------------------------------------------------------
@ -379,6 +428,27 @@ static std::string resolve_var_value(
        }
        return "";
    }
+    // Function calls round-trip lossily through emit_function_call (it injects
+    // ", " between arguments regardless of the input). Reach back into the
+    // original query and copy the source verbatim instead. Matches the
+    // behaviour of the regex-based SET parsers used in algorithms 0-2.
+    if (rhs->type == NodeType::NODE_FUNCTION_CALL) {
+        std::string raw = extract_function_call_source(rhs, query, query_len);
+        if (!raw.empty()) return raw;
+    }
+    // Delimited identifiers (`"$user"`, `"MixedCase"`, `"sch-1"`) carry
+    // FLAG_IDENT_DELIMITED but value_ptr/value_len cover only the content
+    // between the quotes -- the emitter would re-emit the bare identifier,
+    // losing the delimiters that downstream validators need (e.g. the PG
+    // search_path validator distinguishes literal "$user" from the $user
+    // current-user substitution token, "MixedCase" from case-folded
+    // mixedcase, etc.). Splice the quotes back in from the original buffer.
+    if ((rhs->type == NodeType::NODE_IDENTIFIER ||
+         rhs->type == NodeType::NODE_COLUMN_REF) &&
+        (rhs->flags & FLAG_IDENT_DELIMITED)) {
+        std::string raw = emit_delimited_ident_raw(rhs, query, query_len);
+        if (!raw.empty()) return raw;
+    }
    return emit_node_text<D>(rhs, arena);
 }

--- a/test/tap/tests/setparser_parsersql_test.cpp
+++ b/test/tap/tests/setparser_parsersql_test.cpp
@ -27,6 +27,47 @@ static Test parsersql_syntax_errors[] = {
    { Expected("sql_mode", { "SELCT" } ) } },
 };

+// Byte-exact regression tests for the walker's function-call source preservation
+// (pairs of input -> expected verbatim value). The shared `TestParse` strips
+// whitespace and quote-style differences via normalize_value(), which hid an
+// earlier round-trip bug where emit_function_call injected ", " between
+// arguments: `concat(@@sql_mode,'X')` → `concat(@@sql_mode, 'X')`. The version
+// drift carried via session tracking and broke set_testing-t. These cases must
+// compare byte-for-byte to catch any future regression in the same area.
+struct StrictCase {
+  const char* query;
+  const char* var;
+  const char* expected;
+};
+static StrictCase parsersql_function_call_strict[] = {
+  { "SET sql_mode = concat(@@sql_mode,',STRICT_TRANS_TABLES')",
+    "sql_mode", "concat(@@sql_mode,',STRICT_TRANS_TABLES')" },
+  { "SET sql_mode = CONCAT(@@sql_mode, ',STRICT_TRANS_TABLES')",
+    "sql_mode", "CONCAT(@@sql_mode, ',STRICT_TRANS_TABLES')" },
+  { "SET sql_mode = concat( @@sql_mode ,  'X' )",
+    "sql_mode", "concat( @@sql_mode ,  'X' )" },
+};
+
+// Byte-exact regression tests for PG delimited identifier preservation in SET RHS.
+// The AST stores value_ptr inside the quotes and value_len covering only the
+// identifier content, so the walker has to splice the quote chars back in when
+// FLAG_IDENT_DELIMITED is set. Drives the search_path-specific failures in
+// pgsql-set_parameter_validation_test-t where `"MixedCase"` was being lowercased
+// to `mixedcase` and `"$user"` was being mistaken for the current-user substitution.
+struct StrictPgsqlCase {
+  const char* query;
+  const char* var;
+  std::vector<std::string> expected_values;
+};
+static StrictPgsqlCase parsersql_pgsql_ident_strict[] = {
+  { "SET search_path = \"MixedCase\"",        "search_path", { "\"MixedCase\"" } },
+  { "SET search_path = \"MixedCase\", public", "search_path", { "\"MixedCase\"", "public" } },
+  { "SET search_path = \"$user\"",            "search_path", { "\"$user\"" } },
+  { "SET search_path TO \"$user\", public",   "search_path", { "\"$user\"", "public" } },
+  { "SET search_path = \"sch-1\", \"sch 2\"",  "search_path", { "\"sch-1\"", "\"sch 2\"" } },
+  { "SET search_path = pg_catalog, \"$user\"", "search_path", { "pg_catalog", "\"$user\"" } },
+};
+
 // ----------------------------------------------------------------------------
 // MySQL queries from test_filtered_set_statements-t (variables that ProxySQL
 // is supposed to filter out — should still parse cleanly via ParserSQL).
@ -242,6 +283,54 @@ void TestParsePgsql(const Test* tests, int ntests, const std::string& title) {
 }


+void TestStrictFunctionCall(const StrictCase* cases, int n) {
+	for (int i = 0; i < n; i++) {
+		auto result = parsersql_parse_set_mysql(cases[i].query);
+		auto it = result.find(cases[i].var);
+		bool found = (it != result.end() && it->second.size() == 1);
+		ok(found, "[strict_function_call %d] var '%s' present with single value for query: %s",
+			i, cases[i].var, cases[i].query);
+		if (found) {
+			bool eq = (it->second[0] == cases[i].expected);
+			ok(eq, "[strict_function_call %d] byte-exact match for: %s", i, cases[i].query);
+			if (!eq) {
+				diag("  expected: [%s]", cases[i].expected);
+				diag("  got     : [%s]", it->second[0].c_str());
+			}
+		} else {
+			ok(false, "[strict_function_call %d] cannot byte-compare (var missing or multi-value)", i);
+		}
+	}
+}
+
+void TestStrictPgsqlIdent(const StrictPgsqlCase* cases, int n) {
+	for (int i = 0; i < n; i++) {
+		auto result = parsersql_parse_set_pgsql(cases[i].query);
+		auto it = result.find(cases[i].var);
+		bool found = (it != result.end()
+			&& it->second.size() == cases[i].expected_values.size());
+		ok(found, "[strict_pgsql_ident %d] var '%s' present with %zu value(s) for query: %s",
+			i, cases[i].var, cases[i].expected_values.size(), cases[i].query);
+		if (found) {
+			bool eq = true;
+			for (size_t j = 0; j < cases[i].expected_values.size(); ++j) {
+				if (it->second[j] != cases[i].expected_values[j]) { eq = false; break; }
+			}
+			ok(eq, "[strict_pgsql_ident %d] byte-exact match for: %s", i, cases[i].query);
+			if (!eq) {
+				for (size_t j = 0; j < cases[i].expected_values.size(); ++j) {
+					diag("  expected[%zu]: [%s]", j, cases[i].expected_values[j].c_str());
+				}
+				for (size_t j = 0; j < it->second.size(); ++j) {
+					diag("  got[%zu]     : [%s]", j, it->second[j].c_str());
+				}
+			}
+		} else {
+			ok(false, "[strict_pgsql_ident %d] cannot byte-compare (var missing or count mismatch)", i);
+		}
+	}
+}
+
 int main(int argc, char** argv) {
 	unsigned int p = 0;
 	p += arraysize(sql_mode);
@ -258,6 +347,8 @@ int main(int argc, char** argv) {
 	p += arraysize(parsersql_pgsql_search_path);
 	p += arraysize(parsersql_pgsql_time_zone);
 	p *= 2;
+	p += arraysize(parsersql_function_call_strict) * 2;
+	p += arraysize(parsersql_pgsql_ident_strict) * 2;
 	plan(p);
 	TestParse(sql_mode, arraysize(sql_mode), "sql_mode");
 	TestParse(time_zone, arraysize(time_zone), "time_zone");
@ -272,6 +363,8 @@ int main(int argc, char** argv) {
 	TestParse(parsersql_mysql_set_testing, arraysize(parsersql_mysql_set_testing), "mysql_set_testing");
 	TestParsePgsql(parsersql_pgsql_search_path, arraysize(parsersql_pgsql_search_path), "pgsql_search_path");
 	TestParsePgsql(parsersql_pgsql_time_zone, arraysize(parsersql_pgsql_time_zone), "pgsql_time_zone");
+	TestStrictFunctionCall(parsersql_function_call_strict, arraysize(parsersql_function_call_strict));
+	TestStrictPgsqlIdent(parsersql_pgsql_ident_strict, arraysize(parsersql_pgsql_ident_strict));

 	return exit_status();
 }