From e6bef5c585bb918ba83fec02ea07131eda28ede3 Mon Sep 17 00:00:00 2001 From: Rene Cannao Date: Thu, 7 May 2026 08:26:36 +0000 Subject: [PATCH] fix(pgsql): typecast handler swallows the rest of the query (#5755) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `process_pg_typecast()` in lib/pgsql_tokenizer.cpp had two related bugs that made any `::TYPENAME` cast in the middle of a PostgreSQL query silently truncate the digest from the cast onward. ## Bug 1: incorrect exit condition After consuming the type name (and optional modifiers / array brackets) the function tested whether the next char was in an enumerated delimiter list (`)`, `(`, `;`, `,`, `+`, `-`, ...) and returned `st_no_mark_found` only if so. For any other char (the common case: a letter starting the next SQL keyword, e.g. the `F` of `FROM`), it returned `st_pg_typecast`, which the dispatcher then handled by: 1. advancing the cursor by one extra char via `inc_proc_pos()` 2. re-entering `process_pg_typecast()` on the next outer-loop iteration Because `tc->started` was already `true`, the re-entry skipped the `::` skip and started "parsing" a new type name from the middle of the next clause — silently consuming `FROM "Inventory" AS i WHERE ((i."TenantId" = $1) AND NOT (i."IsDeleted"))` and producing digests like `SELECT COUNT(*) AND ((((i."State" ...` that drop ~65 chars of FROM/WHERE structure. Fix: the function unconditionally returns `st_no_mark_found` after consuming the typecast. The earlier delimiter check was attempting to distinguish "this is the end of the typecast" from "this is a continuation", but no continuation case actually reaches that point — type modifiers and array brackets are already handled inline above the check, and PG does not allow the cast itself to span a non-modifier/bracket character. ## Bug 2: `tc->started` never reset `pg_typecast_st::started` is a one-shot guard for the `::`-skip entry block. It was set to `true` on first entry but never reset, so a second `::cast` later in the same query (`SELECT 1::int, 2::text`) re-entered with `started=true` and skipped the `::` skip, eating two characters from the wrong position. Fix: reset `started=false` when exiting via the new unconditional `st_no_mark_found` return. ## Side-effect: trailing-whitespace handling The post-type-name "skip whitespace" loop existed to handle modifier forms like `::int (10)` and `::int []` where PG allows whitespace before the modifier/bracket. But it also ate the legitimate separator space in the common form `::int FROM ...`, which would then glue the two tokens together in the digest (`int` consumed, `FROM` glued to the previous token). The new lookahead-conditional skip preserves the trailing space unless the very next non-space character is a modifier `(` or array bracket `[`. ## Test Added 3 regression tests (5 ok() assertions) to `test/tap/tests/unit/pgsql_tokenizer_unit-t.cpp` registered in `unit-tests-g1`: - `test_digest_typecast_followed_by_clause` — full repro from the issue, asserts `from` and `where` survive, asserts the double-quoted identifier survives. - `test_digest_typecast_then_identifier` — bisected minimal `SELECT a::int FROM t`. - `test_digest_typecast_multiple_in_same_query` — covers Bug 2 by using two `::cast` instances followed by a `FROM` clause. The existing 6 typecast tests (typecast_simple, _varchar, _with_modifier, _array, _in_where, _quoted) and the `digest_2_with_typecast` thread-local-wrapper test continue to pass verbatim. Modifier edge cases (`::varchar (255)` with space, `::numeric(10,2)`, `::int[][]`, `::"my type" FROM t`) verified manually with a standalone reproducer. Closes #5755 --- lib/pgsql_tokenizer.cpp | 55 ++++++++++++------- .../tap/tests/unit/pgsql_tokenizer_unit-t.cpp | 48 +++++++++++++++- 2 files changed, 82 insertions(+), 21 deletions(-) diff --git a/lib/pgsql_tokenizer.cpp b/lib/pgsql_tokenizer.cpp index 13cb201e2..20778bb93 100644 --- a/lib/pgsql_tokenizer.cpp +++ b/lib/pgsql_tokenizer.cpp @@ -1245,8 +1245,6 @@ enum p_st process_replace_null(shared_st* shared_st, const options* opts) { static __attribute__((always_inline)) inline enum p_st process_pg_typecast(shared_st* s, pg_typecast_st* tc) { - enum p_st next = st_pg_typecast; - // On entering state if (!tc->started) { tc->started = true; @@ -1307,11 +1305,27 @@ enum p_st process_pg_typecast(shared_st* s, pg_typecast_st* tc) c = (s->q_cur_pos < s->q_len) ? *s->q : '\0'; } - // Skip any whitespace - while (s->q_cur_pos < s->q_len && is_space_char(c)) { - s->q++; - s->q_cur_pos++; - c = (s->q_cur_pos < s->q_len) ? *s->q : '\0'; + // Skip whitespace, but only if it's followed by a modifier or + // array bracket (e.g. `::int (10)`, `::int []`). If the + // whitespace is just a separator before the next token (e.g. + // `::int FROM ...`), preserve it so the dispatcher sees the + // space and emits it between the typecast and the following + // token in the output. Without this lookahead the typecast + // handler used to consume the trailing space and produced + // digests like `count(*)from x` (issue #5755). + if (s->q_cur_pos < s->q_len && is_space_char(c)) { + int saved_pos = s->q_cur_pos; + const char* saved_q = s->q; + while (s->q_cur_pos < s->q_len && is_space_char(c)) { + s->q++; + s->q_cur_pos++; + c = (s->q_cur_pos < s->q_len) ? *s->q : '\0'; + } + if (c != '(' && c != '[') { + s->q = saved_q; + s->q_cur_pos = saved_pos; + c = (s->q_cur_pos < s->q_len) ? *s->q : '\0'; + } } // Handle type modifiers (parentheses with parameters) @@ -1375,18 +1389,21 @@ enum p_st process_pg_typecast(shared_st* s, pg_typecast_st* tc) } } - // End of type name? Now check if we're at a delimiter - if (s->q_cur_pos >= s->q_len || - is_space_char(c) || - c == ')' || c == '(' || c == ';' || c == ',' || - c == '+' || c == '-' || c == '*' || c == '/' || - c == '=' || c == '<' || c == '>' || c == '@' || - c == ']' || c == '[') { - // Exit state - return st_no_mark_found; - } - - return next; + // All typecast consumption (type name + optional whitespace + + // modifiers + array brackets) is complete by this point, so the + // state must always exit. The previous attempt to detect "end of + // typecast" via an enumerated delimiter list dropped through to + // `return next` for any character not in that list (e.g. the 'F' + // of `FROM` after `::INT FROM "Inventory"`), which kept the + // dispatcher in `st_pg_typecast`, advanced the cursor by one extra + // char per outer-loop iteration, and silently swallowed the rest + // of the query. See issue #5755. + // + // Also reset `started` so a subsequent `::cast` later in the same + // query (e.g. `SELECT 1::INT, 2::TEXT`) re-enters cleanly via the + // entry block at the top of this function. + tc->started = false; + return st_no_mark_found; } /** diff --git a/test/tap/tests/unit/pgsql_tokenizer_unit-t.cpp b/test/tap/tests/unit/pgsql_tokenizer_unit-t.cpp index 98738330e..284c713a7 100644 --- a/test/tap/tests/unit/pgsql_tokenizer_unit-t.cpp +++ b/test/tap/tests/unit/pgsql_tokenizer_unit-t.cpp @@ -261,6 +261,47 @@ static void test_digest_typecast_quoted() { "digest: typecast with quoted type name handled"); } +// Regression test for issue #5755: a typecast in the middle of a query +// must not swallow the rest of the query. The bug was in +// process_pg_typecast() which, after consuming `::TYPENAME`, sometimes +// returned `st_pg_typecast` (instead of `st_no_mark_found`) when the +// next character wasn't in an enumerated delimiter list. The +// dispatcher would then advance one extra char per iteration AND keep +// re-entering the typecast handler, eating subsequent tokens until end +// of input. +static void test_digest_typecast_followed_by_clause() { + // Pre-fix output: "select count(*)" — everything after `::INT` was + // silently dropped. Post-fix the FROM/WHERE clause must survive. + std::string d = digest_query( + "SELECT COUNT(*)::INT FROM \"Inventory\" AS i WHERE i.\"TenantId\"=$1"); + ok(d.find("from") != std::string::npos && + d.find("where") != std::string::npos, + "digest #5755: typecast does not swallow following FROM/WHERE clause"); + ok(d.find("inventory") != std::string::npos || + d.find("Inventory") != std::string::npos, + "digest #5755: quoted identifier after typecast is preserved"); +} + +static void test_digest_typecast_then_identifier() { + // Bisected minimal repro: identifier directly after `::TYPENAME `. + std::string d = digest_query("SELECT a::int FROM t"); + ok(d.find("from") != std::string::npos && d.find("t") != std::string::npos, + "digest #5755: bare identifier after typecast survives"); +} + +// Regression test for the per-call `tc->started` reset: multiple casts +// in the same query must each re-enter the typecast handler cleanly. +// Without the reset, the second `::cast` would skip the `::`-skip +// branch and start parsing the type name from the wrong offset. +static void test_digest_typecast_multiple_in_same_query() { + std::string d = digest_query("SELECT 1::int, 2::text FROM t"); + ok(d.find("from") != std::string::npos && d.find("t") != std::string::npos, + "digest #5755: query with multiple typecasts preserves trailing FROM"); + // Both literals must be replaced with `?`. + ok(d.find("?") != std::string::npos, + "digest #5755: literal replacement still happens before each typecast"); +} + // ============================================================================ // 4. PgSQL-specific: Double-quoted identifiers (preserved, NOT replaced) // ============================================================================ @@ -609,7 +650,7 @@ static void test_digest_only_comment() { // ============================================================================ int main() { - plan(65); + plan(70); int rc = test_init_minimal(); ok(rc == 0, "test_init_minimal() succeeds"); @@ -633,13 +674,16 @@ int main() { test_digest_dollar_quote_in_function(); // 1 test_digest_dollar_quote_with_special_chars(); // 1 - // 3. Type casts (6 tests) + // 3. Type casts (11 tests) test_digest_typecast_simple(); // 1 test_digest_typecast_varchar(); // 1 test_digest_typecast_with_modifier(); // 1 test_digest_typecast_array(); // 1 test_digest_typecast_in_where(); // 1 test_digest_typecast_quoted(); // 1 + test_digest_typecast_followed_by_clause(); // 2 (issue #5755) + test_digest_typecast_then_identifier(); // 1 (issue #5755) + test_digest_typecast_multiple_in_same_query(); // 2 (issue #5755) // 4. Double-quoted identifiers (3 tests) test_digest_double_quoted_identifier(); // 2