fix(set-walker): preserve raw source for function calls and delimited idents

The ParserSQL-backed SET walker (algorithm 3) was round-tripping RHS values
through the Emitter, which normalises whitespace (adds ", " between function
call args) and drops delimiter quotes on identifiers (NODE_IDENTIFIER
emits its content without re-quoting). Both behaviours diverged from the
regex-based parsers used in algorithms 0-2.

The function-call drift cascaded across set_testing-t because the wrong
`sql_mode = concat(@@sql_mode, ',X')` value (with extra space) persisted on
the connection and tripped every subsequent variable comparison until reset.
The delimited-ident drift caused pgsql-set_parameter_validation_test-t to
treat `"MixedCase"` as `mixedcase` and `"$user"` as the current-user
substitution token (rather than the literal identifier the user wrote).

Walker fixes:
  - NODE_FUNCTION_CALL -> paren-match in original query buffer from
    value_ptr, return the verbatim substring.
  - NODE_IDENTIFIER/NODE_COLUMN_REF with FLAG_IDENT_DELIMITED -> splice the
    surrounding quote chars back in from value_ptr-1 / value_ptr+value_len.

Both helpers are no-ops when value_ptr is outside the query buffer (defensive
fallback to the existing emit_node_text path).

Tests: adds two strict (byte-exact) regression suites in
setparser_parsersql_test.cpp -- the existing normalize_value() collapses
whitespace and quote-style differences, which is what hid the bugs in the
first place. The new TestStrictFunctionCall and TestStrictPgsqlIdent
helpers compare verbatim so any future re-introduction is caught.

Local: 288/288 unit tests pass.
pull/5809/head
Rene Cannao 1 month ago
parent 5c9a548292
commit b54aaa5f76

@ -170,6 +170,55 @@ static std::string extract_paren_expr(const char* query, int query_len,
return std::string(start, p);
}
// Extract the verbatim source text of a function-call AST node by paren-matching
// from the function name in the original input. Used to avoid emit_function_call's
// "name(arg, arg)" normalisation (which adds a space after every comma) so that the
// SET walker preserves the exact source the user wrote, matching the regex-based
// SET parsers (algorithms 0-2) byte-for-byte. Returns empty string if value_ptr is
// not inside [query, query+query_len) or no balanced paren is found.
static std::string extract_function_call_source(
const AstNode* node, const char* query, int query_len)
{
if (!node || !node->value_ptr || node->value_len == 0) return "";
const char* qstart = query;
const char* qend = query + query_len;
if (node->value_ptr < qstart || node->value_ptr >= qend) return "";
const char* start = node->value_ptr;
const char* p = start + node->value_len;
while (p < qend && (*p == ' ' || *p == '\t' || *p == '\n' || *p == '\r')) p++;
if (p >= qend || *p != '(') return "";
int depth = 0;
while (p < qend) {
if (*p == '\'' || *p == '"' || *p == '`') { skip_quoted_char(p, qend); }
else if (*p == '(') depth++;
else if (*p == ')') { depth--; if (depth == 0) { p++; break; } }
p++;
}
if (depth != 0) return "";
return std::string(start, p);
}
// Re-emit a delimited identifier ("name" in PG, `name` in MySQL) with its outer
// quote chars restored. The AST stores value_ptr pointing inside the quotes and
// value_len covering only the identifier content, so the surrounding quote chars
// live at value_ptr-1 and value_ptr+value_len in the original query buffer.
// Returns empty string if the node is not in-buffer or the surrounding chars
// don't look like recognised quote chars.
static std::string emit_delimited_ident_raw(
const AstNode* node, const char* query, int query_len)
{
if (!node || !node->value_ptr || node->value_len == 0) return "";
const char* qstart = query;
const char* qend = query + query_len;
if (node->value_ptr <= qstart) return "";
if (node->value_ptr + node->value_len >= qend) return "";
char open_q = *(node->value_ptr - 1);
char close_q = *(node->value_ptr + node->value_len);
if (open_q != '"' && open_q != '`') return "";
if (open_q != close_q) return "";
return std::string(node->value_ptr - 1, node->value_ptr + node->value_len + 1);
}
// ---------------------------------------------------------------------------
// Section 1: Digest adapter
// ---------------------------------------------------------------------------
@ -379,6 +428,27 @@ static std::string resolve_var_value(
}
return "";
}
// Function calls round-trip lossily through emit_function_call (it injects
// ", " between arguments regardless of the input). Reach back into the
// original query and copy the source verbatim instead. Matches the
// behaviour of the regex-based SET parsers used in algorithms 0-2.
if (rhs->type == NodeType::NODE_FUNCTION_CALL) {
std::string raw = extract_function_call_source(rhs, query, query_len);
if (!raw.empty()) return raw;
}
// Delimited identifiers (`"$user"`, `"MixedCase"`, `"sch-1"`) carry
// FLAG_IDENT_DELIMITED but value_ptr/value_len cover only the content
// between the quotes -- the emitter would re-emit the bare identifier,
// losing the delimiters that downstream validators need (e.g. the PG
// search_path validator distinguishes literal "$user" from the $user
// current-user substitution token, "MixedCase" from case-folded
// mixedcase, etc.). Splice the quotes back in from the original buffer.
if ((rhs->type == NodeType::NODE_IDENTIFIER ||
rhs->type == NodeType::NODE_COLUMN_REF) &&
(rhs->flags & FLAG_IDENT_DELIMITED)) {
std::string raw = emit_delimited_ident_raw(rhs, query, query_len);
if (!raw.empty()) return raw;
}
return emit_node_text<D>(rhs, arena);
}

@ -27,6 +27,47 @@ static Test parsersql_syntax_errors[] = {
{ Expected("sql_mode", { "SELCT" } ) } },
};
// Byte-exact regression tests for the walker's function-call source preservation
// (pairs of input -> expected verbatim value). The shared `TestParse` strips
// whitespace and quote-style differences via normalize_value(), which hid an
// earlier round-trip bug where emit_function_call injected ", " between
// arguments: `concat(@@sql_mode,'X')` → `concat(@@sql_mode, 'X')`. The version
// drift carried via session tracking and broke set_testing-t. These cases must
// compare byte-for-byte to catch any future regression in the same area.
struct StrictCase {
const char* query;
const char* var;
const char* expected;
};
static StrictCase parsersql_function_call_strict[] = {
{ "SET sql_mode = concat(@@sql_mode,',STRICT_TRANS_TABLES')",
"sql_mode", "concat(@@sql_mode,',STRICT_TRANS_TABLES')" },
{ "SET sql_mode = CONCAT(@@sql_mode, ',STRICT_TRANS_TABLES')",
"sql_mode", "CONCAT(@@sql_mode, ',STRICT_TRANS_TABLES')" },
{ "SET sql_mode = concat( @@sql_mode , 'X' )",
"sql_mode", "concat( @@sql_mode , 'X' )" },
};
// Byte-exact regression tests for PG delimited identifier preservation in SET RHS.
// The AST stores value_ptr inside the quotes and value_len covering only the
// identifier content, so the walker has to splice the quote chars back in when
// FLAG_IDENT_DELIMITED is set. Drives the search_path-specific failures in
// pgsql-set_parameter_validation_test-t where `"MixedCase"` was being lowercased
// to `mixedcase` and `"$user"` was being mistaken for the current-user substitution.
struct StrictPgsqlCase {
const char* query;
const char* var;
std::vector<std::string> expected_values;
};
static StrictPgsqlCase parsersql_pgsql_ident_strict[] = {
{ "SET search_path = \"MixedCase\"", "search_path", { "\"MixedCase\"" } },
{ "SET search_path = \"MixedCase\", public", "search_path", { "\"MixedCase\"", "public" } },
{ "SET search_path = \"$user\"", "search_path", { "\"$user\"" } },
{ "SET search_path TO \"$user\", public", "search_path", { "\"$user\"", "public" } },
{ "SET search_path = \"sch-1\", \"sch 2\"", "search_path", { "\"sch-1\"", "\"sch 2\"" } },
{ "SET search_path = pg_catalog, \"$user\"", "search_path", { "pg_catalog", "\"$user\"" } },
};
// ----------------------------------------------------------------------------
// MySQL queries from test_filtered_set_statements-t (variables that ProxySQL
// is supposed to filter out — should still parse cleanly via ParserSQL).
@ -242,6 +283,54 @@ void TestParsePgsql(const Test* tests, int ntests, const std::string& title) {
}
void TestStrictFunctionCall(const StrictCase* cases, int n) {
for (int i = 0; i < n; i++) {
auto result = parsersql_parse_set_mysql(cases[i].query);
auto it = result.find(cases[i].var);
bool found = (it != result.end() && it->second.size() == 1);
ok(found, "[strict_function_call %d] var '%s' present with single value for query: %s",
i, cases[i].var, cases[i].query);
if (found) {
bool eq = (it->second[0] == cases[i].expected);
ok(eq, "[strict_function_call %d] byte-exact match for: %s", i, cases[i].query);
if (!eq) {
diag(" expected: [%s]", cases[i].expected);
diag(" got : [%s]", it->second[0].c_str());
}
} else {
ok(false, "[strict_function_call %d] cannot byte-compare (var missing or multi-value)", i);
}
}
}
void TestStrictPgsqlIdent(const StrictPgsqlCase* cases, int n) {
for (int i = 0; i < n; i++) {
auto result = parsersql_parse_set_pgsql(cases[i].query);
auto it = result.find(cases[i].var);
bool found = (it != result.end()
&& it->second.size() == cases[i].expected_values.size());
ok(found, "[strict_pgsql_ident %d] var '%s' present with %zu value(s) for query: %s",
i, cases[i].var, cases[i].expected_values.size(), cases[i].query);
if (found) {
bool eq = true;
for (size_t j = 0; j < cases[i].expected_values.size(); ++j) {
if (it->second[j] != cases[i].expected_values[j]) { eq = false; break; }
}
ok(eq, "[strict_pgsql_ident %d] byte-exact match for: %s", i, cases[i].query);
if (!eq) {
for (size_t j = 0; j < cases[i].expected_values.size(); ++j) {
diag(" expected[%zu]: [%s]", j, cases[i].expected_values[j].c_str());
}
for (size_t j = 0; j < it->second.size(); ++j) {
diag(" got[%zu] : [%s]", j, it->second[j].c_str());
}
}
} else {
ok(false, "[strict_pgsql_ident %d] cannot byte-compare (var missing or count mismatch)", i);
}
}
}
int main(int argc, char** argv) {
unsigned int p = 0;
p += arraysize(sql_mode);
@ -258,6 +347,8 @@ int main(int argc, char** argv) {
p += arraysize(parsersql_pgsql_search_path);
p += arraysize(parsersql_pgsql_time_zone);
p *= 2;
p += arraysize(parsersql_function_call_strict) * 2;
p += arraysize(parsersql_pgsql_ident_strict) * 2;
plan(p);
TestParse(sql_mode, arraysize(sql_mode), "sql_mode");
TestParse(time_zone, arraysize(time_zone), "time_zone");
@ -272,6 +363,8 @@ int main(int argc, char** argv) {
TestParse(parsersql_mysql_set_testing, arraysize(parsersql_mysql_set_testing), "mysql_set_testing");
TestParsePgsql(parsersql_pgsql_search_path, arraysize(parsersql_pgsql_search_path), "pgsql_search_path");
TestParsePgsql(parsersql_pgsql_time_zone, arraysize(parsersql_pgsql_time_zone), "pgsql_time_zone");
TestStrictFunctionCall(parsersql_function_call_strict, arraysize(parsersql_function_call_strict));
TestStrictPgsqlIdent(parsersql_pgsql_ident_strict, arraysize(parsersql_pgsql_ident_strict));
return exit_status();
}

Loading…
Cancel
Save