From 6db52ea942b592b833bb7f8582c02fe78165a594 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Jaramago=20Fern=C3=A1ndez?= Date: Tue, 22 Apr 2025 10:06:33 +0200 Subject: [PATCH 1/4] Fix query digests for certain comments of particular size Removed unnecessary boundary check for certain comments. --- lib/c_tokenizer.cpp | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/lib/c_tokenizer.cpp b/lib/c_tokenizer.cpp index 9fc5a4c73..01d109b56 100644 --- a/lib/c_tokenizer.cpp +++ b/lib/c_tokenizer.cpp @@ -778,14 +778,9 @@ enum p_st process_cmnt_type_2(shared_st* shared_st) { if (*shared_st->q == '#' && shared_st->q_cur_pos <= (shared_st->q_len - 2)) { shared_st->q += 1; shared_st->q_cur_pos += 1; - - if (shared_st->q_cur_pos == (shared_st->q_len - 2)) { - next_state = st_no_mark_found; - return next_state; - } } - if (*shared_st->q == '\n' || *shared_st->q == '\r' || (shared_st->q_cur_pos == shared_st->q_len - 1)) { + if (*shared_st->q == '\n' || *shared_st->q == '\r' || (shared_st->q_cur_pos >= shared_st->q_len - 1)) { next_state = st_no_mark_found; shared_st->prev_char = ' '; @@ -818,14 +813,9 @@ enum p_st process_cmnt_type_3(shared_st* shared_st) { ) { shared_st->q += 3; shared_st->q_cur_pos += 3; - - if (shared_st->q_cur_pos == (shared_st->q_len - 4)) { - next_state = st_no_mark_found; - return next_state; - } } - if (*shared_st->q == '\n' || *shared_st->q == '\r' || (shared_st->q_cur_pos == shared_st->q_len - 1)) { + if (*shared_st->q == '\n' || *shared_st->q == '\r' || (shared_st->q_cur_pos >= shared_st->q_len - 1)) { next_state = st_no_mark_found; shared_st->prev_char = ' '; From 1c416275ee9c468f0927b6b11bf9314619c4ede3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Jaramago=20Fern=C3=A1ndez?= Date: Tue, 22 Apr 2025 10:08:54 +0200 Subject: [PATCH 2/4] Add regression tests for certain query digests comments --- .../tests/tokenizer_payloads/regular_tokenizer_digests.hjson | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff 
--git a/test/tap/tests/tokenizer_payloads/regular_tokenizer_digests.hjson b/test/tap/tests/tokenizer_payloads/regular_tokenizer_digests.hjson index 77dc7aa59..a535021fe 100644 --- a/test/tap/tests/tokenizer_payloads/regular_tokenizer_digests.hjson +++ b/test/tap/tests/tokenizer_payloads/regular_tokenizer_digests.hjson @@ -24,7 +24,10 @@ "# random_comment\n select 1.1 #final_comment \n ", "-- random_comment\n select 1.1 -- final_comment\n ", "-- random_comment\n select 1.1-- final_comment \n", - "-- random_comment\n select 1.1 -- final_comment \n " + // NOTE: Regression check for last `-- ` comment with '4' chars + "select 1 -- aaaa", + // NOTE: Regression check for last `# ` comment with '1' char + "select 1 # a" ], "s1": "select ?", "s2": "select ?", From 5c3a0637758d889a5c370e59a9488c5f7072734d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Jaramago=20Fern=C3=A1ndez?= Date: Thu, 1 May 2025 17:22:54 +0200 Subject: [PATCH 3/4] Fix digest comment removal for queries over 'query_digests_max_query_length' Due to a typo/confusion, the boundary being used for the comments check was 'd_max_len' instead of 'q_len'. This prevented the correct detection of a comment start when the query exceeded 'query_digests_max_query_length' which determines the value for 'd_max_len'. 
--- lib/c_tokenizer.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/c_tokenizer.cpp b/lib/c_tokenizer.cpp index 01d109b56..5281be3dc 100644 --- a/lib/c_tokenizer.cpp +++ b/lib/c_tokenizer.cpp @@ -447,7 +447,8 @@ enum p_st get_next_st(const options* opts, struct shared_st* shared_st) { // cmnt type 1 - start with '/*' if( // v1_crashing_payload_05 - shared_st->q_cur_pos < (shared_st->d_max_len-1) && *shared_st->q == '/' && *(shared_st->q+1) == '*' + shared_st->q_cur_pos < (shared_st->q_len - 2) && + *shared_st->q == '/' && *(shared_st->q+1) == '*' ) { st = st_cmnt_type_1; } @@ -458,7 +459,7 @@ enum p_st get_next_st(const options* opts, struct shared_st* shared_st) { // cmnt type 3 - start with '--' else if ( // shared_st->query isn't over, need to check next character - shared_st->q_cur_pos < (shared_st->d_max_len-2) && + shared_st->q_cur_pos < (shared_st->q_len - 2) && // found starting pattern '-- ' (space is required) *shared_st->q == '-' && *(shared_st->q+1) == '-' && is_space_char(*(shared_st->q+2)) ) { From 1d7840673ab701e6ed1672bce8f296945d2468cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Jaramago=20Fern=C3=A1ndez?= Date: Thu, 1 May 2025 17:33:28 +0200 Subject: [PATCH 4/4] Several updates for 'test_mysql_query_digests_stages-t.cpp' - New regression payloads for fix in 5c3a0637. - Added config option for 'null' replacement in 'mz' payloads. - When specifying 'regular' as command line option, it's now possible to also specify the path to the file holding the payloads to be tested. 
--- .../test_mysql_query_digests_stages-t.cpp | 18 +++++++- .../regular_tokenizer_digests.hjson | 43 +++++++++++++++++++ 2 files changed, 59 insertions(+), 2 deletions(-) diff --git a/test/tap/tests/test_mysql_query_digests_stages-t.cpp b/test/tap/tests/test_mysql_query_digests_stages-t.cpp index 910d6227d..4cfa2bb52 100644 --- a/test/tap/tests/test_mysql_query_digests_stages-t.cpp +++ b/test/tap/tests/test_mysql_query_digests_stages-t.cpp @@ -232,6 +232,7 @@ void process_mz_test_def(const nlohmann::json& test_def, const char* c_query, co bool no_digest = true; int lowercase = 0; bool keep_comment = false; + bool replace_null = true; if (mz_test_def.contains("digest_max_size")) { digest_max_size = mz_test_def.at("digest_max_size"); @@ -255,6 +256,9 @@ void process_mz_test_def(const nlohmann::json& test_def, const char* c_query, co if (mz_test_def.contains("keep_comment")) { keep_comment = mz_test_def.at("keep_comment"); } + if (mz_test_def.contains("replace_null")) { + replace_null = mz_test_def.at("replace_null"); + } int backup_digest_max_length = mysql_thread___query_digests_max_query_length; mysql_thread___query_digests_max_query_length = digest_max_size; @@ -266,6 +270,8 @@ void process_mz_test_def(const nlohmann::json& test_def, const char* c_query, co mysql_thread___query_digests_no_digits = replace_digits; int lowercase_backup = mysql_thread___query_digests_lowercase; mysql_thread___query_digests_lowercase = lowercase_backup; + int replace_null_backup = mysql_thread___query_digests_replace_null; + mysql_thread___query_digests_replace_null = replace_null; int keep_comment_backup = mysql_thread___query_digests_keep_comment; mysql_thread___query_digests_keep_comment = keep_comment; @@ -287,6 +293,7 @@ void process_mz_test_def(const nlohmann::json& test_def, const char* c_query, co mysql_thread___query_digests_no_digits = no_digits_backup; mysql_thread___query_digests_lowercase = lowercase_backup; mysql_thread___query_digests_keep_comment = keep_comment_backup; 
+ mysql_thread___query_digests_replace_null = replace_null_backup; if (query.size() >= QUERY_DIGEST_BUF) { free(c_res); @@ -772,9 +779,10 @@ int main(int argc, char** argv) { bool exec_grouping_tests = true; bool exec_regular_tests = true; std::string tests_filter_str {}; + std::string digests_file_arg {}; // check parameters for test filtering - if (argc == 2) { + if (argc >= 2) { tests_filter_str = argv[1]; if (tests_filter_str.find("crashing") == std::string::npos) { @@ -786,9 +794,15 @@ int main(int argc, char** argv) { if (tests_filter_str.find("regular") == std::string::npos) { exec_regular_tests = false; } + + if (argc == 3) { + digests_file_arg = argv[2]; + } } - const string digests_filepath { string(cl.workdir) + DIGESTS_TEST_FILENAME }; + const string digests_filepath { + digests_file_arg.empty() ? string(cl.workdir) + DIGESTS_TEST_FILENAME : digests_file_arg + }; const string crashing_payloads { string(cl.workdir) + "tokenizer_payloads/crashing_payloads.hjson" }; uint32_t max_groups = 10; diff --git a/test/tap/tests/tokenizer_payloads/regular_tokenizer_digests.hjson b/test/tap/tests/tokenizer_payloads/regular_tokenizer_digests.hjson index a535021fe..2843350d3 100644 --- a/test/tap/tests/tokenizer_payloads/regular_tokenizer_digests.hjson +++ b/test/tap/tests/tokenizer_payloads/regular_tokenizer_digests.hjson @@ -957,6 +957,49 @@ } ] }, + { + // Correct compression of final comment '--' with query exceeding 'query_digests_max_query_length' + "q": [ + "insert into table (field1, field2, field3, field4, field5, field6) values ('00000000000000000000000000', null, '11111111111111111111111111', 'STOP', 1745624748961, '00000000000000000000000000') on duplicate key update field4 = 'STOP', field5 = 1745624748961, field6 = '11111111111111111111111111' -- cmt: OOOOOOOOOOOOOOOOOOOOOOOOOO:NNNNNNNNNNNNNNNNNNNNNNNNNN:N" + ], + "mz": [ + { + "digest_max_size": 299, + "grouping_limit": 3, + "replace_null": false, + "groups_grouping_limit": 10, + "digest": "insert into 
table (field1,field2,field3,field4,field5,field6) values (?,null,?,?,?,?) on duplicate key update field4 = ?,field5 = ?,field6 = ?" } ] }, { // Correct compression of final comment '--' with query exceeding 'query_digests_max_query_length' "q": [ "SELECT '01JSQMJPDJEF' -- cmt: foo" ], "mz": [ { "digest_max_size": 25, "grouping_limit": 3, "groups_grouping_limit": 10, "digest": "SELECT ?" } ] }, { // Correct compression of final comment '/*' with query exceeding 'query_digests_max_query_length' "q": [ "SELECT '01JSQMJPDJEF' /* cmt: foo */" ], "mz": [ { "digest_max_size": 24, "grouping_limit": 3, "groups_grouping_limit": 10, "digest": "SELECT ?" } ] }, { // digest_corner_cases_1.hjson - Testing the compression limits of number parsing when buffer is exceeded "q": [