From 6db52ea942b592b833bb7f8582c02fe78165a594 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Jaramago=20Fern=C3=A1ndez?= Date: Tue, 22 Apr 2025 10:06:33 +0200 Subject: [PATCH 1/4] Fix query digests for certain comments of particular size Removed unnecessary boundary check for certain comments. --- lib/c_tokenizer.cpp | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/lib/c_tokenizer.cpp b/lib/c_tokenizer.cpp index 9fc5a4c73..01d109b56 100644 --- a/lib/c_tokenizer.cpp +++ b/lib/c_tokenizer.cpp @@ -778,14 +778,9 @@ enum p_st process_cmnt_type_2(shared_st* shared_st) { if (*shared_st->q == '#' && shared_st->q_cur_pos <= (shared_st->q_len - 2)) { shared_st->q += 1; shared_st->q_cur_pos += 1; - - if (shared_st->q_cur_pos == (shared_st->q_len - 2)) { - next_state = st_no_mark_found; - return next_state; - } } - if (*shared_st->q == '\n' || *shared_st->q == '\r' || (shared_st->q_cur_pos == shared_st->q_len - 1)) { + if (*shared_st->q == '\n' || *shared_st->q == '\r' || (shared_st->q_cur_pos >= shared_st->q_len - 1)) { next_state = st_no_mark_found; shared_st->prev_char = ' '; @@ -818,14 +813,9 @@ enum p_st process_cmnt_type_3(shared_st* shared_st) { ) { shared_st->q += 3; shared_st->q_cur_pos += 3; - - if (shared_st->q_cur_pos == (shared_st->q_len - 4)) { - next_state = st_no_mark_found; - return next_state; - } } - if (*shared_st->q == '\n' || *shared_st->q == '\r' || (shared_st->q_cur_pos == shared_st->q_len - 1)) { + if (*shared_st->q == '\n' || *shared_st->q == '\r' || (shared_st->q_cur_pos >= shared_st->q_len - 1)) { next_state = st_no_mark_found; shared_st->prev_char = ' '; From 1c416275ee9c468f0927b6b11bf9314619c4ede3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Jaramago=20Fern=C3=A1ndez?= Date: Tue, 22 Apr 2025 10:08:54 +0200 Subject: [PATCH 2/4] Add regression tests for certain query digests comments --- .../tests/tokenizer_payloads/regular_tokenizer_digests.hjson | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff 
--git a/test/tap/tests/tokenizer_payloads/regular_tokenizer_digests.hjson b/test/tap/tests/tokenizer_payloads/regular_tokenizer_digests.hjson index 77dc7aa59..a535021fe 100644 --- a/test/tap/tests/tokenizer_payloads/regular_tokenizer_digests.hjson +++ b/test/tap/tests/tokenizer_payloads/regular_tokenizer_digests.hjson @@ -24,7 +24,10 @@ "# random_comment\n select 1.1 #final_comment \n ", "-- random_comment\n select 1.1 -- final_comment\n ", "-- random_comment\n select 1.1-- final_comment \n", - "-- random_comment\n select 1.1 -- final_comment \n " + // NOTE: Regression check for last `-- ` comment with '4' chars + "select 1 -- aaaa", + // NOTE: Regression check for last `# ` comment with '1' char + "select 1 # a" ], "s1": "select ?", "s2": "select ?", From 5c3a0637758d889a5c370e59a9488c5f7072734d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Jaramago=20Fern=C3=A1ndez?= Date: Thu, 1 May 2025 17:22:54 +0200 Subject: [PATCH 3/4] Fix digest comment removal for queries over 'query_digests_max_query_length' Due to a typo/confusion, the boundary being used for the comments check was 'd_max_len' instead of 'q_len'. This prevented the correct detection of a comment start when the query exceeded 'query_digests_max_query_length' which determines the value for 'd_max_len'. 
--- lib/c_tokenizer.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/c_tokenizer.cpp b/lib/c_tokenizer.cpp index 01d109b56..5281be3dc 100644 --- a/lib/c_tokenizer.cpp +++ b/lib/c_tokenizer.cpp @@ -447,7 +447,8 @@ enum p_st get_next_st(const options* opts, struct shared_st* shared_st) { // cmnt type 1 - start with '/*' if( // v1_crashing_payload_05 - shared_st->q_cur_pos < (shared_st->d_max_len-1) && *shared_st->q == '/' && *(shared_st->q+1) == '*' + shared_st->q_cur_pos < (shared_st->q_len - 2) && + *shared_st->q == '/' && *(shared_st->q+1) == '*' ) { st = st_cmnt_type_1; } @@ -458,7 +459,7 @@ enum p_st get_next_st(const options* opts, struct shared_st* shared_st) { // cmnt type 3 - start with '--' else if ( // shared_st->query isn't over, need to check next character - shared_st->q_cur_pos < (shared_st->d_max_len-2) && + shared_st->q_cur_pos < (shared_st->q_len - 2) && // found starting pattern '-- ' (space is required) *shared_st->q == '-' && *(shared_st->q+1) == '-' && is_space_char(*(shared_st->q+2)) ) { From 1d7840673ab701e6ed1672bce8f296945d2468cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Jaramago=20Fern=C3=A1ndez?= Date: Thu, 1 May 2025 17:33:28 +0200 Subject: [PATCH 4/4] Several updates for 'test_mysql_query_digests_stages-t.cpp' - New regression payloads for fix in 5c3a0637. - Added config option for 'null' replacement in 'mz' payloads. - When specifying 'regular' as command line option, it's now possible to also specify the path to the file holding the payloads to be tested. 
--- .../test_mysql_query_digests_stages-t.cpp | 18 +++++++- .../regular_tokenizer_digests.hjson | 43 +++++++++++++++++++ 2 files changed, 59 insertions(+), 2 deletions(-) diff --git a/test/tap/tests/test_mysql_query_digests_stages-t.cpp b/test/tap/tests/test_mysql_query_digests_stages-t.cpp index 910d6227d..4cfa2bb52 100644 --- a/test/tap/tests/test_mysql_query_digests_stages-t.cpp +++ b/test/tap/tests/test_mysql_query_digests_stages-t.cpp @@ -232,6 +232,7 @@ void process_mz_test_def(const nlohmann::json& test_def, const char* c_query, co bool no_digest = true; int lowercase = 0; bool keep_comment = false; + bool replace_null = true; if (mz_test_def.contains("digest_max_size")) { digest_max_size = mz_test_def.at("digest_max_size"); @@ -255,6 +256,9 @@ void process_mz_test_def(const nlohmann::json& test_def, const char* c_query, co if (mz_test_def.contains("keep_comment")) { keep_comment = mz_test_def.at("keep_comment"); } + if (mz_test_def.contains("replace_null")) { + replace_null = mz_test_def.at("replace_null"); + } int backup_digest_max_length = mysql_thread___query_digests_max_query_length; mysql_thread___query_digests_max_query_length = digest_max_size; @@ -266,6 +270,8 @@ void process_mz_test_def(const nlohmann::json& test_def, const char* c_query, co mysql_thread___query_digests_no_digits = replace_digits; int lowercase_backup = mysql_thread___query_digests_lowercase; mysql_thread___query_digests_lowercase = lowercase_backup; + int replace_null_backup = mysql_thread___query_digests_replace_null; + mysql_thread___query_digests_replace_null = replace_null; int keep_comment_backup = mysql_thread___query_digests_keep_comment; mysql_thread___query_digests_keep_comment = keep_comment; @@ -287,6 +293,7 @@ void process_mz_test_def(const nlohmann::json& test_def, const char* c_query, co mysql_thread___query_digests_no_digits = no_digits_backup; mysql_thread___query_digests_lowercase = lowercase_backup; mysql_thread___query_digests_keep_comment = keep_comment_backup; 
+ mysql_thread___query_digests_replace_null = replace_null_backup; if (query.size() >= QUERY_DIGEST_BUF) { free(c_res); @@ -772,9 +779,10 @@ int main(int argc, char** argv) { bool exec_grouping_tests = true; bool exec_regular_tests = true; std::string tests_filter_str {}; + std::string digests_file_arg {}; // check parameters for test filtering - if (argc == 2) { + if (argc >= 2) { tests_filter_str = argv[1]; if (tests_filter_str.find("crashing") == std::string::npos) { @@ -786,9 +794,15 @@ int main(int argc, char** argv) { if (tests_filter_str.find("regular") == std::string::npos) { exec_regular_tests = false; } + + if (argc == 3) { + digests_file_arg = argv[2]; + } } - const string digests_filepath { string(cl.workdir) + DIGESTS_TEST_FILENAME }; + const string digests_filepath { + digests_file_arg.empty() ? string(cl.workdir) + DIGESTS_TEST_FILENAME : digests_file_arg + }; const string crashing_payloads { string(cl.workdir) + "tokenizer_payloads/crashing_payloads.hjson" }; uint32_t max_groups = 10; diff --git a/test/tap/tests/tokenizer_payloads/regular_tokenizer_digests.hjson b/test/tap/tests/tokenizer_payloads/regular_tokenizer_digests.hjson index a535021fe..2843350d3 100644 --- a/test/tap/tests/tokenizer_payloads/regular_tokenizer_digests.hjson +++ b/test/tap/tests/tokenizer_payloads/regular_tokenizer_digests.hjson @@ -957,6 +957,49 @@ } ] }, + { + // Correct compression of final comment '--' with query exceeding 'query_digests_max_query_length' + "q": [ + "insert into table (field1, field2, field3, field4, field5, field6) values ('00000000000000000000000000', null, '11111111111111111111111111', 'STOP', 1745624748961, '00000000000000000000000000') on duplicate key update field4 = 'STOP', field5 = 1745624748961, field6 = '11111111111111111111111111' -- cmt: OOOOOOOOOOOOOOOOOOOOOOOOOO:NNNNNNNNNNNNNNNNNNNNNNNNNN:N" + ], + "mz": [ + { + "digest_max_size": 299, + "grouping_limit": 3, + "replace_null": false, + "groups_grouping_limit": 10, + "digest": "insert into 
table (field1,field2,field3,field4,field5,field6) values (?,null,?,?,?,?) on duplicate key update field4 = ?,field5 = ?,field6 = ?" } ] }, { // Correct compression of final comment '--' with query exceeding 'query_digests_max_query_length' "q": [ "SELECT '01JSQMJPDJEF' -- cmt: foo" ], "mz": [ { "digest_max_size": 25, "grouping_limit": 3, "groups_grouping_limit": 10, "digest": "SELECT ?" } ] }, { // Correct compression of final comment '/*' with query exceeding 'query_digests_max_query_length' "q": [ "SELECT '01JSQMJPDJEF' /* cmt: foo */" ], "mz": [ { "digest_max_size": 24, "grouping_limit": 3, "groups_grouping_limit": 10, "digest": "SELECT ?" } ] }, { // digest_corner_cases_1.hjson - Testing the compression limits of number parsing when buffer is exceeded "q": [