From 8997cecf8a92297f39c15929f293dcaf5269350a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Javier=20Jaramago=20Fern=C3=A1ndez?=
 <jaramago.fernandez.javier@gmail.com>
Date: Wed, 10 Aug 2022 14:43:51 +0200
Subject: [PATCH] Fix digest processing of escaped string delimiters

---
 lib/c_tokenizer.cpp | 45 ++++++++++++++++++++++++++++++++++++---------
 1 file changed, 36 insertions(+), 9 deletions(-)

diff --git a/lib/c_tokenizer.cpp b/lib/c_tokenizer.cpp
index 40feab9b1..93ed0bc99 100644
--- a/lib/c_tokenizer.cpp
+++ b/lib/c_tokenizer.cpp
@@ -885,6 +885,8 @@ typedef struct shared_st {
 	char* res_cur_pos;
 	/* @brief Position in the return buffer prior to the start of any parsing st that isn't 'no_mark_found'. */
 	char* res_pre_pos;
+	/* @brief Preserve currently imposed 'prev_char' in **on current** char processing instead of replacing it. */
+	bool keep_prev_char;
 	/* @brief Last copied char to the result buffer. */
 	char prev_char;
 	/* @brief Decides whether or not the next char should be copy during 'stage_1'. */
@@ -921,6 +923,7 @@ typedef struct literal_string_st {
 	int delim_num;
 	/* @brief Found char delimiter found for the literal string being processed. */
 	char delim_char;
+	const char* q_start_pos;
 } literal_string_st;
 
 /**
@@ -1086,6 +1089,18 @@ enum p_st get_next_st(const struct options* opts, struct shared_st* shared_st) {
 	return st;
 }
 
+static __attribute__((always_inline)) inline
+void inc_proc_pos(shared_st* shared_st) {
+	if (shared_st->keep_prev_char == false) {
+		shared_st->prev_char = *shared_st->q;
+	} else {
+		shared_st->keep_prev_char = false;
+	}
+
+	shared_st->q++;
+	shared_st->q_cur_pos++;
+}
+
 /**
  * @brief Copy the next character and increment the current processing position.
  *
@@ -1100,9 +1115,8 @@ void copy_next_char(shared_st* shared_st, options* opts) {
 	} else {
 		*shared_st->res_cur_pos++ = !is_space_char(*shared_st->q) ? tolower(*shared_st->q) : ' ';
 	}
-	// store previously copied char and increment positions
-	shared_st->prev_char = *shared_st->q++;
-	shared_st->q_cur_pos++;
+
+	inc_proc_pos(shared_st);
 }
 
 char cur_cmd_cmnt[FIRST_COMMENT_MAX_LENGTH];
@@ -1429,6 +1443,19 @@ enum p_st process_cmnt_type_3(shared_st* shared_st) {
  *   current char until the end delimiter is found. Then replaces the previous position in the result buffer
  *   with the mark '?'.
  *
+ *  TODO: This function currently doesn't take into account if 'NO_BACKSLASH_ESCAPES' sql_mode is being used.
+ *  This can lead to 'stats_mysql_query_digest' pollution because a valid query could be received with strings
+ *  ending in '\''. With current implementation this final '\'' will be collapsed, leading to a string not
+ *  properly finding the target string delimiter. To solve this scenario the following approach could be taken:
+ *   - Add an additional parameter to 'Query_Info::begin' that propagates 'no_backslash_escapes' from
+ *     'MySQL_Session::client_myds::myconn::options' through 'Query_Info::query_parser_init'.
+ *   - Add a new parameter to 'query_parser_init' (or reuse currently unused 'flags' for this purpose).
+ *   - Add a new parameter to 'mysql_query_digest_and_first_comment_2' to propagate this flags.
+ *   - Add a new field into the 'options' struct defined in this file for holding such flags.
+ *   - Pass 'options' already received by 'stage_1_parsing' into this function and make use of it for deciding
+ *     whether to ignore the processing of chars within the string when are preceded by '\'.
+ *  This is just a proposal and a future implementation may be subject to change.
+ *
  * @param shared_st Shared state used to continue the query processing.
  * @param str_st The literal string parsing state, holds the information so far found about the state.
  *
@@ -1443,6 +1470,7 @@ enum p_st process_literal_string(shared_st* shared_st, literal_string_st* str_st
 	// process the first delimiter
 	if (str_st->delim_num == 0) {
 		// store found delimiter
+		str_st->q_start_pos = shared_st->q;
 		str_st->delim_char = *shared_st->q;
 		str_st->delim_num = 1;
 
@@ -1451,7 +1479,7 @@ enum p_st process_literal_string(shared_st* shared_st, literal_string_st* str_st
 	}
 
 	// need to be ignored case
-	if(shared_st->res_cur_pos > shared_st->res_pre_pos + SIZECHAR)
+	if(shared_st->q > str_st->q_start_pos + SIZECHAR)
 	{
 		if(
 			(shared_st->prev_char == '\\' && *shared_st->q == '\\') || // to process '\\\\', '\\'
@@ -1459,10 +1487,10 @@ enum p_st process_literal_string(shared_st* shared_st, literal_string_st* str_st
 			(shared_st->prev_char == str_st->delim_char && *shared_st->q == str_st->delim_char) // to process ''''
 		)
 		{
+			shared_st->keep_prev_char = true;
 			shared_st->prev_char = 'X';
-			shared_st->q++;
-			shared_st->q_cur_pos++;
 
+			// NOTE: Don't increment the position in query buffer. See 'stage_1_parsing' doc.
 			return next_state;
 		}
 	}
@@ -1490,6 +1518,7 @@ enum p_st process_literal_string(shared_st* shared_st, literal_string_st* str_st
 		// reinit the string literal state
 		str_st->delim_char = 0;
 		str_st->delim_num = 0;
+		str_st->q_start_pos = 0;
 
 		// update the shared state
 		shared_st->prev_char = str_st->delim_char;
@@ -1896,9 +1925,7 @@ void stage_1_parsing(shared_st* shared_st, stage_1_st* stage_1_st, options* opts
 			if (shared_st->copy_next_char) {
 				copy_next_char(shared_st, opts);
 			} else {
-				// if we do not copy we skip the next char, but copy it to `prev_char`
-				shared_st->prev_char = *shared_st->q++;
-				shared_st->q_cur_pos++;
+				inc_proc_pos(shared_st);
 			}
 		}
 	}