Add PostgreSQL dialect support: dollar-quoted strings, identifier quoting, and dialect-specific comment rules

This change introduces PostgreSQL-aware tokenization by adding support for dollar-quoted strings, PostgreSQL's double-quoted identifiers, and its comment rules. The tokenizer now correctly parses $$…$$ and $tag$…$tag$, treats " as an identifier delimiter in PostgreSQL, disables MySQL-only # comments, and accepts -- as a comment starter without requiring a trailing space. All new behavior is fully isolated behind the dialect flag to avoid impacting MySQL parsing.

Add PostgreSQL dollar-quoted strings:
* New parser state: st_dollar_quote_string.
* Recognizes $$ … $$ and $tag$ … $tag$ sequences.
* Tracks the opening tag and searches for the matching terminator.
* Normalizes the entire literal to ?.
* Integrated into get_next_st() and stage_1_parsing().
6 months ago · 285fb1b4e1
parent 7205f424a2
commit 285fb1b4e1
3 changed files with 188 additions and 16 deletions
--- a/include/c_tokenizer.h
+++ b/include/c_tokenizer.h
@ -23,6 +23,11 @@ tokenizer_t;

 enum { TOKENIZER_EMPTIES_OK, TOKENIZER_NO_EMPTIES };

+enum sql_dialect { // SQL dialect whose lexical rules the digest tokenizer applies (see get_next_st())
+	DIALECT_MYSQL, // '#' starts a comment, '--' needs a following space char, '"' starts a string literal
+	DIALECT_PG // no '#' comments, '--' needs no following space, $tag$ strings, '"' is not a string delimiter
+};
+
 /**
 * @brief Struct for holding all the configuration options used for query digests generation.
 */
@ -34,6 +39,7 @@ typedef struct _options {
 	int grouping_limit;
 	int groups_grouping_limit;
 	int max_query_length;
+	sql_dialect dialect;
 } options;


--- a/lib/Query_Processor.cpp
+++ b/lib/Query_Processor.cpp
@ -1815,6 +1815,7 @@ void Query_Processor<QP_DERIVED>::query_parser_init(SQP_par_t *qp, const char *q
 		opts.groups_grouping_limit = GET_THREAD_VARIABLE(query_digests_groups_grouping_limit);
 		opts.keep_comment = GET_THREAD_VARIABLE(query_digests_keep_comment);
 		opts.max_query_length = GET_THREAD_VARIABLE(query_digests_max_query_length);
+		opts.dialect = (std::is_same_v<QP_DERIVED, MySQL_Query_Processor>) ? DIALECT_MYSQL : DIALECT_PG;

 		qp->digest_text=query_digest_and_first_comment_2(query, query_length, &qp->first_comment, 
 			((query_length < QUERY_DIGEST_BUF) ? qp->buf : NULL), &opts);
--- a/lib/c_tokenizer.cpp
+++ b/lib/c_tokenizer.cpp
@ -243,6 +243,7 @@ static inline void get_mysql_options(options* opts) {
 	opts->groups_grouping_limit = mysql_thread___query_digests_groups_grouping_limit;
 	opts->keep_comment = mysql_thread___query_digests_keep_comment;
 	opts->max_query_length = mysql_thread___query_digests_max_query_length;
+	opts->dialect = DIALECT_MYSQL;
 }

 /**
@ -255,7 +256,8 @@ enum p_st {
 	st_cmnt_type_3 = 3,
 	st_literal_string = 4,
 	st_literal_number = 5,
-	st_replace_null = 6
+	st_replace_null = 6,
+	st_dollar_quote_string = 7
 };

 /**
@ -333,6 +335,15 @@ typedef struct literal_digit_st {
 	char* start_pos;
 } literal_digit_st;

+/**
+ * State used for parsing PostgreSQL dollar-quoted strings, i.e: $$foo$$, $tag$bar$tag$, etc..
+ * A NULL 'tag_start' means that no dollar-quoted string is currently being parsed.
+ */
+typedef struct dollar_quote_string_st {
+	const char* tag_start;  // pointer to the first tag character, right after the opening '$'
+	size_t tag_len;       // length of tag (can be 0 for $$)
+} dollar_quote_string_st;
+
 /**
 * @brief Created for an alternative implementation of NULL parsing.
 *   Currently unused. TODO: Remove.
@ -348,6 +359,7 @@ typedef struct stage_1_st {
 	struct cmnt_type_1_st cmnt_type_1_st;
 	struct literal_string_st literal_str_st;
 	struct literal_digit_st literal_digit_st;
+	struct dollar_quote_string_st dollar_quote_str_st;
 	/* @brief Holds the previous iteration parsing ending position. */
 	char* pre_it_pos;
 	/**
@ -452,27 +464,60 @@ enum p_st get_next_st(const options* opts, struct shared_st* shared_st) {
 	) {
 		st = st_cmnt_type_1;
 	}
-	// cmnt type 2 - start with '#'
-	else if(*shared_st->q == '#') {
+	// cmnt type 2 - #  (only for MySQL/MariaDB)
+	else if (opts->dialect == DIALECT_MYSQL && *shared_st->q == '#') {
 		st = st_cmnt_type_2;
 	}
-	// cmnt type 3 - start with '--'
-	else if (
-		// shared_st->query isn't over, need to check next character
-		shared_st->q_cur_pos < (shared_st->q_len - 2) &&
-		// found starting pattern '-- ' (space is required)
-		*shared_st->q == '-' && *(shared_st->q+1) == '-' && is_space_char(*(shared_st->q+2))
-	) {
-		if (prev_char != '-') {
-			st = st_cmnt_type_3;
+	// cmnt type 3 - -- ... (dialect-dependent)
+	else if (*shared_st->q == '-' && shared_st->q_cur_pos < (shared_st->q_len - 1) && 
+		*(shared_st->q + 1) == '-')
+	{
+		if (opts->dialect == DIALECT_PG) {
+			// PG: -- starts comment regardless of following space
+			if (prev_char != '-') { st = st_cmnt_type_3; }
+			else if (shared_st->q_cur_pos == 0) { st = st_cmnt_type_3; }
+		} else { // MySQL behavior: require a whitespace/control after --
+			if (shared_st->q_cur_pos < (shared_st->q_len - 2) &&
+				is_space_char(*(shared_st->q + 2)))
+			{
+				if (prev_char != '-') { st = st_cmnt_type_3; }
+				else if (shared_st->q_cur_pos == 0) { st = st_cmnt_type_3; }
+			}
 		}
-		else if (shared_st->q_cur_pos == 0) {
-			st = st_cmnt_type_3;
+	}
+	// dollar-quoted string start (Postgres: $tag$ or $$)
+	else if (opts->dialect == DIALECT_PG && *shared_st->q == '$') {
+		// Check for a PostgreSQL dollar-quoted string.
+		// Format: $tag$ ... $tag$
+		//
+		// The tag may be empty or consist only of letters, digits, or underscores.
+		// Example valid tags: $$, $foo$, $TAG_123$
+		//
+		// Here we scan characters after the first '$' to verify that:
+		//   1. All tag characters are [A-Za-z0-9_], and
+		//   2. The tag is terminated by another '$'
+		//
+		// If so, we treat it as the start of a dollar-quoted string literal.
+		const char* p = shared_st->q + 1;
+		while (p < shared_st->q + (shared_st->q_len - shared_st->q_cur_pos) &&
+			((*p >= 'A' && *p <= 'Z') || (*p >= 'a' && *p <= 'z') || (*p >= '0' && *p <= '9') || *p == '_')) {
+			p++;
+		}
+		if (p < shared_st->q + (shared_st->q_len - shared_st->q_cur_pos) && *p == '$') {
+			st = st_dollar_quote_string; // add new enum state for dollar-quoted string
 		}
 	}
-	// string - start with '
-	else if (*shared_st->q == '\'' || *shared_st->q == '"') {
+	// string - single-quote is string in both; double-quote depends on dialect
+	else if (*shared_st->q == '\'') {
 		st = st_literal_string;
+	} else if (*shared_st->q == '"') {
+		if (opts->dialect == DIALECT_PG) {
+			// treat as identifier, not string
+		} else {
+			// MySQL: double quote may be string (unless ANSI_QUOTES enabled)
+			// FIXME: Add ANSI_QUOTES support
+			st = st_literal_string;
+		}
 	}
 	// may be digit - start with digit
 	else if (is_token_char(prev_char) && is_digit_char(*shared_st->q)) {
@ -924,6 +969,118 @@ enum p_st process_literal_string(shared_st* shared_st, literal_string_st* str_st
 	return next_state;
 }

+/**
+ * @brief Handles the processing state 'st_dollar_quote_string' (PostgreSQL `$$...$$` / `$tag$...$tag$` literals).
+ *
+ * @details With no tag stored, the opening delimiter at the current position is validated, recorded and skipped.
+ *   Later the matching closing delimiter is searched; once found, output is rewound to 'res_pre_pos' and a '?' written.
+ *
+ * @param shared_st Shared state used to continue the query processing.
+ * @param dq_st The dollar-quoted string parsing state, holds the information so far found about the state.
+ *
+ * @return The next processing state, it could be either:
+ *   - 'st_dollar_quote_string' if the dollar-quoted string hasn't yet completed to be parsed.
+ *   - 'st_no_mark_found' if the literal was fully parsed, the opening delimiter is invalid, or the remaining
+ *     input is too short to hold a closing delimiter (the stored tag is cleared in that case).
+ */
+static __attribute__((always_inline)) inline
+enum p_st process_dollar_quote_string(shared_st* shared_st, dollar_quote_string_st* dq_st)
+{
+	enum p_st next_state = st_dollar_quote_string;
+
+	// Number of bytes remaining in the input buffer
+	size_t remaining = shared_st->q_len - shared_st->q_cur_pos;
+
+	// ============================================================
+	// PHASE 1 — Detect and initialize the opening $tag$
+	// ============================================================
+	if (dq_st->tag_start == NULL) {
+		// At least "$$" is needed to form a valid opening delimiter
+		if (remaining < 2) {
+			return st_no_mark_found;
+		}
+
+		// Read tag characters after the first '$' until another '$' or buffer end
+		// Valid characters: [A-Za-z0-9_]
+		const char* p = shared_st->q + 1;
+		while ((size_t)(p - shared_st->q) < remaining && *p != '$') {
+			char c = *p;
+			if (!((c >= 'a' && c <= 'z') ||
+				  (c >= 'A' && c <= 'Z') ||
+				  (c >= '0' && c <= '9') ||
+				   c == '_'))
+			{
+				// Illegal tag character -> this is not a dollar-quote
+				return st_no_mark_found;
+			}
+			p++;
+		}
+
+		// If we reached end-of-buffer or didn't find a closing '$', it's not valid
+		if ((size_t)(p - shared_st->q) >= remaining || *p != '$') {
+			return st_no_mark_found;
+		}
+
+		// Store tag metadata:
+		// Example: $TAG$ -> tag_start points to 'T', tag_len = 3
+		dq_st->tag_start = shared_st->q + 1;                  // first char of tag
+		dq_st->tag_len = (size_t)(p - dq_st->tag_start);      // 0 for $$
+
+		// Advance input pointers past the opening delimiter. 'p' was bounds-checked above and points at
+		// the delimiter's second '$', so these 'tag_len + 2' bytes are within the buffer.
+		shared_st->q += dq_st->tag_len + 2;
+		shared_st->q_cur_pos += dq_st->tag_len + 2;
+
+		return next_state; // Continue scanning inside the string
+	}
+
+	// ============================================================
+	// PHASE 2 — Inside the dollar-quoted string
+	// Look for the closing delimiter $tag$
+	// ============================================================
+	const size_t delim_len = dq_st->tag_len + 2; // '$' + tag + '$'
+	while (shared_st->q_cur_pos < shared_st->q_len) {
+		remaining = shared_st->q_len - shared_st->q_cur_pos;
+
+		if (remaining < delim_len) {
+			// The closing delimiter can no longer fit: the literal is unterminated. Clear the tag so
+			// that a later '$tag$' opener is parsed by PHASE 1 instead of being matched against this
+			// stale tag.
+			dq_st->tag_start = NULL;
+			dq_st->tag_len = 0;
+			return st_no_mark_found;
+		}
+
+		// Validate: '$' + tag + '$'
+		if (*shared_st->q == '$' &&
+			memcmp(shared_st->q + 1, dq_st->tag_start, dq_st->tag_len) == 0 &&
+			*(shared_st->q + 1 + dq_st->tag_len) == '$')
+		{
+			// Found the closing delimiter:
+			// replace the entire dollar-quoted string with a single '?'
+			shared_st->res_cur_pos = shared_st->res_pre_pos;
+			*shared_st->res_cur_pos++ = '?';
+
+			// Skip past the closing delimiter
+			shared_st->q += delim_len;
+			shared_st->q_cur_pos += delim_len;
+
+			// Reset stored tag so the next string can be detected
+			dq_st->tag_start = NULL;
+			dq_st->tag_len = 0;
+
+			return st_no_mark_found;
+		}
+
+		// No delimiter found here -> consume one character and continue
+		shared_st->q++;
+		shared_st->q_cur_pos++;
+	}
+
+	// Reached end-of-buffer while still inside the string
+	return next_state;
+}
+
 /**
 * @brief Handles the processing state 'st_literal_digit'.
 *
@ -1194,6 +1351,7 @@ void stage_1_parsing(shared_st* shared_st, stage_1_st* stage_1_st, const options
 	cmnt_type_1_st* const cmnt_type_1_st = &stage_1_st->cmnt_type_1_st;
 	literal_string_st* const literal_str_st = &stage_1_st->literal_str_st;
 	literal_digit_st* const literal_digit_st = &stage_1_st->literal_digit_st;
+	dollar_quote_string_st* const dollar_quote_str_st = &stage_1_st->dollar_quote_str_st;

 	// starting state can belong to a previous iteration
 	enum p_st cur_st = shared_st->st;
@ -1294,6 +1452,13 @@ void stage_1_parsing(shared_st* shared_st, stage_1_st* stage_1_st, const options
 					shared_st->copy_next_char = 1;
 					continue;
 				}
+			} else if (cur_st == st_dollar_quote_string) {
+				shared_st->copy_next_char = 0;
+				cur_st = process_dollar_quote_string(shared_st, dollar_quote_str_st);
+				if (cur_st == st_no_mark_found) {
+					shared_st->copy_next_char = 1;
+					continue;
+				}
 			} else if (cur_st == st_literal_number) {
 				shared_st->copy_next_char = 1;
 				cur_st = process_literal_digit(shared_st, literal_digit_st, opts);