// proxysql/test/tap/tests/unit/c_tokenizer_unit-t.cpp

/**
 * @file c_tokenizer_unit-t.cpp
 * @brief Unit tests for the SQL tokenizer/digest functions in lib/c_tokenizer.cpp.
 *
 * Tests the pure parsing functions:
 *   - mysql_query_digest_and_first_comment() / mysql_query_digest_and_first_comment_2()
 *   - mysql_query_digest_first_stage()
 *   - mysql_query_strip_comments()
 *   - tokenizer(), tokenize(), free_tokenizer()
 *   - c_split_2()
 *
 * These functions have no global state dependencies beyond the __thread
 * variables defined by the test harness (test_globals.o).
 */

#include "tap.h"
#include "test_globals.h"
#include "test_init.h"

#include "proxysql.h"
#include "c_tokenizer.h"

#include <cstring>
#include <cstdlib>
#include <string>

// Thread-local variables used by mysql_query_digest_first_stage and
// mysql_query_digest_and_first_comment_2 (which call get_mysql_options internally).
// They are only declared here; per the file header, the definitions are expected
// to come from the test harness (test_globals.o). setup_digest_defaults() assigns
// every one of them before the tests run.
extern __thread int  mysql_thread___query_digests_max_query_length;
extern __thread bool mysql_thread___query_digests_lowercase;
extern __thread bool mysql_thread___query_digests_replace_null;
extern __thread bool mysql_thread___query_digests_no_digits;
extern __thread int  mysql_thread___query_digests_grouping_limit;
extern __thread int  mysql_thread___query_digests_groups_grouping_limit;
extern __thread bool mysql_thread___query_digests_keep_comment;

/**
 * @brief Assign the baseline value used by this test file to every
 *        thread-local digest option.
 *
 * main() calls this once, before any test function runs.
 */
static void setup_digest_defaults() {
	// Integer limits.
	mysql_thread___query_digests_max_query_length      = 2048;
	mysql_thread___query_digests_grouping_limit        = 3;
	mysql_thread___query_digests_groups_grouping_limit = 0;

	// Switches that are turned on.
	mysql_thread___query_digests_lowercase    = true;
	mysql_thread___query_digests_replace_null = true;
	mysql_thread___query_digests_no_digits    = true;

	// Switches that are turned off.
	mysql_thread___query_digests_keep_comment = false;
}

// ============================================================================
// Helper: call mysql_query_digest_and_first_comment with explicit options
// ============================================================================

/**
 * @brief Convenience wrapper that digests a query using the explicit options API.
 *
 * Returns the digest as a std::string. Uses a stack buffer to avoid malloc.
 */
static std::string digest_query(const char* query, bool lowercase = true,
                                 bool replace_null = true, bool no_digits = true,
                                 int grouping_limit = 3, int groups_grouping_limit = 0,
                                 bool keep_comment = false, int max_query_length = 2048) {
	char buf[2048];
	memset(buf, 0, sizeof(buf));
	char* first_comment = nullptr;

	options opts;
	opts.lowercase = lowercase;
	opts.replace_null = replace_null;
	opts.replace_number = no_digits;
	opts.grouping_limit = grouping_limit;
	opts.groups_grouping_limit = groups_grouping_limit;
	opts.keep_comment = keep_comment;
	opts.max_query_length = max_query_length;

	int q_len = (int)strlen(query);
	char* result = mysql_query_digest_and_first_comment(query, q_len, &first_comment, buf, &opts);

	std::string ret(result);
	if (first_comment) free(first_comment);
	return ret;
}

/**
 * @brief Digest @p query through mysql_query_digest_and_first_comment_2(),
 *        the variant that takes no explicit options argument.
 *
 * A zeroed 2048-byte stack buffer is passed to the call; any first comment
 * handed back is released before returning.
 *
 * @param query NUL-terminated SQL text; its strlen() is passed as the length.
 * @return A std::string copy of the text returned by the digest call.
 */
static std::string digest_query_2(const char* query) {
	char digest_buf[2048] = {};
	char* comment = nullptr;
	const int query_len = static_cast<int>(strlen(query));

	const char* digest_text =
		mysql_query_digest_and_first_comment_2(query, query_len, &comment, digest_buf);

	std::string digest_copy(digest_text);
	free(comment);  // free(nullptr) is a no-op, so no guard is needed
	return digest_copy;
}

/**
 * @brief Run mysql_query_digest_first_stage() on @p query and return its
 *        output as a std::string.
 *
 * A zeroed 2048-byte stack buffer is passed to the call; any first comment
 * handed back is released before returning.
 *
 * @param query NUL-terminated SQL text; its strlen() is passed as the length.
 * @return A std::string copy of the text returned by the first-stage call.
 */
static std::string digest_first_stage(const char* query) {
	char digest_buf[2048] = {};
	char* comment = nullptr;
	const int query_len = static_cast<int>(strlen(query));

	const char* stage_text =
		mysql_query_digest_first_stage(query, query_len, &comment, digest_buf);

	std::string stage_copy(stage_text);
	free(comment);  // free(nullptr) is a no-op, so no guard is needed
	return stage_copy;
}

// ============================================================================
// 1. tokenizer / tokenize / free_tokenizer
// ============================================================================

static void test_tokenizer_basic() {
	tokenizer_t tok;
	tokenizer(&tok, "hello,world,foo", ",", TOKENIZER_NO_EMPTIES);

	const char* t1 = tokenize(&tok);
	ok(t1 != nullptr && strcmp(t1, "hello") == 0,
		"tokenizer: first token is 'hello'");

	const char* t2 = tokenize(&tok);
	ok(t2 != nullptr && strcmp(t2, "world") == 0,
		"tokenizer: second token is 'world'");

	const char* t3 = tokenize(&tok);
	ok(t3 != nullptr && strcmp(t3, "foo") == 0,
		"tokenizer: third token is 'foo'");

	const char* t4 = tokenize(&tok);
	ok(t4 == nullptr, "tokenizer: returns NULL after last token");
}

static void test_tokenizer_empties_ok() {
	tokenizer_t tok;
	tokenizer(&tok, "a,,b", ",", TOKENIZER_EMPTIES_OK);

	const char* t1 = tokenize(&tok);
	ok(t1 != nullptr && strcmp(t1, "a") == 0,
		"tokenizer empties_ok: first token is 'a'");

	const char* t2 = tokenize(&tok);
	ok(t2 != nullptr && strcmp(t2, "") == 0,
		"tokenizer empties_ok: second token is empty string");

	const char* t3 = tokenize(&tok);
	ok(t3 != nullptr && strcmp(t3, "b") == 0,
		"tokenizer empties_ok: third token is 'b'");

	const char* t4 = tokenize(&tok);
	ok(t4 == nullptr, "tokenizer empties_ok: returns NULL after last token");
}

static void test_tokenizer_no_empties() {
	tokenizer_t tok;
	tokenizer(&tok, "a,,b", ",", TOKENIZER_NO_EMPTIES);

	const char* t1 = tokenize(&tok);
	ok(t1 != nullptr && strcmp(t1, "a") == 0,
		"tokenizer no_empties: first token is 'a'");

	const char* t2 = tokenize(&tok);
	ok(t2 != nullptr && strcmp(t2, "b") == 0,
		"tokenizer no_empties: second token is 'b' (empties skipped)");

	const char* t3 = tokenize(&tok);
	ok(t3 == nullptr, "tokenizer no_empties: returns NULL after last token");
}

static void test_tokenizer_empty_string() {
	tokenizer_t tok;
	tokenizer(&tok, "", ",", TOKENIZER_NO_EMPTIES);

	const char* t1 = tokenize(&tok);
	ok(t1 == nullptr, "tokenizer empty string: returns NULL immediately");
}

static void test_tokenizer_null_input() {
	tokenizer_t tok;
	tokenizer(&tok, nullptr, ",", TOKENIZER_NO_EMPTIES);

	const char* t1 = tokenize(&tok);
	ok(t1 == nullptr, "tokenizer NULL input: returns NULL immediately");
}

static void test_tokenizer_no_delimiter_found() {
	tokenizer_t tok;
	tokenizer(&tok, "hello", ",", TOKENIZER_NO_EMPTIES);

	const char* t1 = tokenize(&tok);
	ok(t1 != nullptr && strcmp(t1, "hello") == 0,
		"tokenizer no delimiter: returns whole string as single token");

	const char* t2 = tokenize(&tok);
	ok(t2 == nullptr, "tokenizer no delimiter: returns NULL after single token");
}

static void test_tokenizer_long_string() {
	// Exceeds PROXYSQL_TOKENIZER_BUFFSIZE (128) to test strdup path
	std::string long_str(200, 'x');
	long_str[100] = ',';

	tokenizer_t tok;
	tokenizer(&tok, long_str.c_str(), ",", TOKENIZER_NO_EMPTIES);

	const char* t1 = tokenize(&tok);
	ok(t1 != nullptr && strlen(t1) == 100,
		"tokenizer long string: first token has correct length");

	const char* t2 = tokenize(&tok);
	ok(t2 != nullptr && strlen(t2) == 99,
		"tokenizer long string: second token has correct length");

	const char* t3 = tokenize(&tok);
	ok(t3 == nullptr, "tokenizer long string: returns NULL after last token");
}

// ============================================================================
// 2. c_split_2
// ============================================================================

/**
 * @brief "key=value" split on '=' must give "key" and "value" (2 assertions).
 *
 * Both output strings are released with free() afterwards.
 */
static void test_c_split_2_basic() {
	char* left = nullptr;
	char* right = nullptr;
	c_split_2("key=value", "=", &left, &right);

	const bool left_matches = (left != nullptr) && (strcmp(left, "key") == 0);
	const bool right_matches = (right != nullptr) && (strcmp(right, "value") == 0);

	ok(left_matches, "c_split_2: first part is 'key'");
	ok(right_matches, "c_split_2: second part is 'value'");

	free(left);
	free(right);
}

/**
 * @brief Without a delimiter in the input, the first part must be the whole
 *        string and the second part an empty string (2 assertions).
 *
 * Both output strings are released with free() afterwards.
 */
static void test_c_split_2_no_delimiter() {
	char* left = nullptr;
	char* right = nullptr;
	c_split_2("nodelmiter", "=", &left, &right);

	const bool left_matches = (left != nullptr) && (strcmp(left, "nodelmiter") == 0);
	const bool right_matches = (right != nullptr) && (strcmp(right, "") == 0);

	ok(left_matches, "c_split_2 no delim: first part is whole string");
	ok(right_matches, "c_split_2 no delim: second part is empty string");

	free(left);
	free(right);
}

/**
 * @brief An empty input must give two empty, non-NULL strings (2 assertions).
 *
 * Both output strings are released with free() afterwards.
 */
static void test_c_split_2_empty_string() {
	char* left = nullptr;
	char* right = nullptr;
	c_split_2("", "=", &left, &right);

	const bool left_matches = (left != nullptr) && (strcmp(left, "") == 0);
	const bool right_matches = (right != nullptr) && (strcmp(right, "") == 0);

	ok(left_matches, "c_split_2 empty: first part is empty string");
	ok(right_matches, "c_split_2 empty: second part is empty string");

	free(left);
	free(right);
}

/**
 * @brief "a=b=c" split on '=' must give "a" and "b"; the third field is not
 *        reported (2 assertions).
 *
 * Both output strings are released with free() afterwards.
 */
static void test_c_split_2_multiple_delimiters() {
	char* left = nullptr;
	char* right = nullptr;
	c_split_2("a=b=c", "=", &left, &right);

	const bool left_matches = (left != nullptr) && (strcmp(left, "a") == 0);
	const bool right_matches = (right != nullptr) && (strcmp(right, "b") == 0);

	ok(left_matches, "c_split_2 multi delim: first part is 'a'");
	ok(right_matches, "c_split_2 multi delim: second part is 'b' (third ignored)");

	free(left);
	free(right);
}

// ============================================================================
// 3. mysql_query_digest_and_first_comment — number/literal replacement
// ============================================================================

/** @brief An integer literal in a WHERE clause must be digested to '?'. */
static void test_digest_simple_select() {
	const std::string expected = "select * from users where id=?";
	const std::string actual = digest_query("SELECT * FROM users WHERE id=1");
	ok(actual == expected, "digest: simple SELECT with integer literal replaced");
}

/** @brief Integer, string and float values in an INSERT must each become '?'. */
static void test_digest_insert_values() {
	const std::string expected = "insert into orders values(?,?,?)";
	const std::string actual = digest_query("INSERT INTO orders VALUES(1,'test',3.14)");
	ok(actual == expected, "digest: INSERT with multiple literal types replaced");
}

/** @brief A single-quoted string literal must be digested to '?'. */
static void test_digest_string_literals() {
	const std::string expected = "select * from t where name=?";
	const std::string actual = digest_query("SELECT * FROM t WHERE name='alice'");
	ok(actual == expected, "digest: string literal replaced with ?");
}

/** @brief A double-quoted string literal must be digested to '?'. */
static void test_digest_double_quoted_string() {
	const std::string expected = "select * from t where name=?";
	const std::string actual = digest_query("SELECT * FROM t WHERE name=\"bob\"");
	ok(actual == expected, "digest: double-quoted string literal replaced with ?");
}

/** @brief A floating-point literal must be digested to a single '?'. */
static void test_digest_float_literal() {
	const std::string expected = "select * from t where val=?";
	const std::string actual = digest_query("SELECT * FROM t WHERE val=3.14159");
	ok(actual == expected, "digest: float literal replaced with ?");
}

/** @brief A negative integer (sign included) must be digested to '?'. */
static void test_digest_negative_number() {
	const std::string expected = "select * from t where val=?";
	const std::string actual = digest_query("SELECT * FROM t WHERE val=-42");
	ok(actual == expected, "digest: negative number replaced with ?");
}

/** @brief A 0x-prefixed hexadecimal literal must be digested to '?'. */
static void test_digest_hex_literal() {
	const std::string expected = "select * from t where id=?";
	const std::string actual = digest_query("SELECT * FROM t WHERE id=0xFF");
	ok(actual == expected, "digest: hex literal replaced with ?");
}

/** @brief With the default replace_null=true, NULL must be digested to '?'. */
static void test_digest_null_replacement() {
	const std::string expected = "select * from t where val=?";
	const std::string actual = digest_query("SELECT * FROM t WHERE val=NULL");
	ok(actual == expected, "digest: NULL replaced with ? when replace_null=true");
}

/**
 * @brief With replace_null=false the NULL keyword must survive in the digest.
 *
 * Either spelling ("NULL" or "null") is accepted.
 */
static void test_digest_null_no_replacement() {
	const bool lowercase = true;
	const bool replace_null = false;
	const std::string digest =
		digest_query("SELECT * FROM t WHERE val=NULL", lowercase, replace_null);

	const bool has_upper = digest.find("NULL") != std::string::npos;
	const bool has_lower = digest.find("null") != std::string::npos;
	ok(has_upper || has_lower, "digest: NULL preserved when replace_null=false");
}

// ============================================================================
// 4. mysql_query_digest_and_first_comment — comment handling
// ============================================================================

/** @brief A leading block comment must not appear in the digest. */
static void test_digest_comment_stripped() {
	const std::string expected = "select ?";
	const std::string actual = digest_query("/* comment */ SELECT 1");
	ok(actual == expected, "digest: block comment stripped");
}

/** @brief A trailing "-- ..." comment must not appear in the digest. */
static void test_digest_inline_comment() {
	const std::string expected = "select ?";
	const std::string actual = digest_query("SELECT 1 -- inline comment\n");
	ok(actual == expected, "digest: inline comment (--) stripped");
}

/** @brief A trailing "# ..." comment must not appear in the digest. */
static void test_digest_hash_comment() {
	const std::string expected = "select ?";
	const std::string actual = digest_query("SELECT 1 # hash comment\n");
	ok(actual == expected, "digest: hash comment stripped");
}

static void test_digest_first_comment_extracted() {
	char buf[2048];
	memset(buf, 0, sizeof(buf));
	char* first_comment = nullptr;

	options opts;
	opts.lowercase = true;
	opts.replace_null = true;
	opts.replace_number = true;
	opts.grouping_limit = 3;
	opts.groups_grouping_limit = 0;
	opts.keep_comment = true;
	opts.max_query_length = 2048;

	const char* query = "/* my_comment */ SELECT 1";
	mysql_query_digest_and_first_comment(query, (int)strlen(query), &first_comment, buf, &opts);

	ok(first_comment != nullptr,
		"digest: first_comment is not NULL when keep_comment=true");
	if (first_comment) {
		ok(strstr(first_comment, "my_comment") != nullptr,
			"digest: first_comment contains 'my_comment'");
		free(first_comment);
	} else {
		ok(false, "digest: first_comment contains 'my_comment' (skipped, was NULL)");
	}
}

// ============================================================================
// 5. mysql_query_digest_and_first_comment — lowercase
// ============================================================================

/** @brief With lowercase=true, keywords and identifiers must be lowercased. */
static void test_digest_lowercase() {
	const bool lowercase = true;
	const std::string expected = "select * from users";
	const std::string actual = digest_query("SELECT * FROM Users", lowercase);
	ok(actual == expected, "digest: keywords and identifiers lowercased");
}

/** @brief With lowercase=false, the original letter case must be kept. */
static void test_digest_no_lowercase() {
	const bool lowercase = false;
	const std::string expected = "SELECT * FROM Users";
	const std::string actual = digest_query("SELECT * FROM Users", lowercase);
	ok(actual == expected, "digest: case preserved when lowercase=false");
}

// ============================================================================
// 6. mysql_query_digest_and_first_comment — grouping
// ============================================================================

/**
 * @brief A five-element IN list digested with grouping_limit=2 must contain
 *        the "..." collapse marker.
 *
 * Only the presence of "..." is checked, not the exact digest text.
 */
static void test_digest_grouping_limit() {
	const bool lowercase = true;
	const bool replace_null = true;
	const bool no_digits = true;
	const int grouping_limit = 2;

	const std::string digest = digest_query(
		"SELECT * FROM t WHERE id IN (1,2,3,4,5)",
		lowercase, replace_null, no_digits, grouping_limit);

	const bool collapsed = digest.find("...") != std::string::npos;
	ok(collapsed, "digest: grouping_limit=2 collapses excess IN-list values to '...'");
}

// ============================================================================
// 7. mysql_query_digest_first_stage (thread-local wrapper)
// ============================================================================

/** @brief The first-stage digest must replace an integer literal with '?'. */
static void test_digest_first_stage_basic() {
	const std::string expected = "select * from t where id=?";
	const std::string actual = digest_first_stage("SELECT * FROM t WHERE id=42");
	ok(actual == expected, "first_stage: basic SELECT with literal replacement");
}

/** @brief The first-stage digest must replace a string literal with '?'. */
static void test_digest_first_stage_string() {
	const std::string expected = "select * from t where name=?";
	const std::string actual = digest_first_stage("SELECT * FROM t WHERE name='test'");
	ok(actual == expected, "first_stage: string literal replaced");
}

// ============================================================================
// 8. mysql_query_digest_and_first_comment_2 (thread-local wrapper)
// ============================================================================

/**
 * @brief The thread-local wrapper must replace both an integer and a string
 *        literal in the same query.
 */
static void test_digest_2_basic() {
	const std::string expected = "select * from t where id=? and name=?";
	const std::string actual = digest_query_2("SELECT * FROM t WHERE id=1 AND name='foo'");
	ok(actual == expected,
		"digest_2: multiple literal types replaced via thread-local wrapper");
}

// ============================================================================
// 9. mysql_query_strip_comments
// ============================================================================

/**
 * @brief mysql_query_strip_comments() with its third argument set to true
 *        must drop a block comment and keep a lowercased "select"
 *        (always 3 assertions).
 *
 * The call receives a strdup'ed, writable copy of the query. When the result
 * is NULL the two dependent assertions are recorded as failures so the
 * planned count still matches.
 */
static void test_strip_comments_block() {
	char* writable_query = strdup("/* comment */ SELECT 1");
	char* stripped = mysql_query_strip_comments(
		writable_query, static_cast<int>(strlen(writable_query)), true);

	ok(stripped != nullptr, "strip_comments: result is not NULL");

	if (stripped == nullptr) {
		ok(false, "strip_comments: block comment removed (skipped)");
		ok(false, "strip_comments: SELECT keyword preserved (skipped)");
	} else {
		const bool comment_gone = (strstr(stripped, "comment") == nullptr);
		ok(comment_gone, "strip_comments: block comment removed");

		const bool select_kept = (strstr(stripped, "select") != nullptr);
		ok(select_kept, "strip_comments: SELECT keyword preserved (lowercased)");

		free(stripped);
	}

	free(writable_query);
}

/**
 * @brief mysql_query_strip_comments() must drop a trailing "-- ..." comment
 *        (always 2 assertions).
 *
 * The call receives a strdup'ed, writable copy of the query and false as its
 * third argument. When the result is NULL the dependent assertion is recorded
 * as a failure so the planned count still matches.
 */
static void test_strip_comments_inline() {
	char* writable_query = strdup("SELECT 1 -- inline\n");
	char* stripped = mysql_query_strip_comments(
		writable_query, static_cast<int>(strlen(writable_query)), false);

	ok(stripped != nullptr, "strip_comments inline: result is not NULL");

	if (stripped == nullptr) {
		ok(false, "strip_comments inline: -- comment removed (skipped)");
	} else {
		const bool comment_gone = (strstr(stripped, "inline") == nullptr);
		ok(comment_gone, "strip_comments inline: -- comment removed");
		free(stripped);
	}

	free(writable_query);
}

// ============================================================================
// 10. Edge cases
// ============================================================================

/** @brief Digesting an empty query must give an empty digest. */
static void test_digest_empty_query() {
	const std::string digest = digest_query("");
	const bool is_empty = digest.empty();
	ok(is_empty, "digest: empty query produces empty digest");
}

/**
 * @brief A query made only of spaces must digest to either nothing or a
 *        single space; both outcomes are accepted.
 */
static void test_digest_whitespace_only() {
	const std::string digest = digest_query("   ");
	const bool acceptable = digest.empty() || digest == " ";
	ok(acceptable, "digest: whitespace-only query produces empty or single-space digest");
}

/** @brief Runs of spaces between tokens must collapse to one space. */
static void test_digest_multiple_spaces() {
	const std::string expected = "select * from t";
	const std::string actual = digest_query("SELECT  *   FROM    t");
	ok(actual == expected, "digest: multiple spaces collapsed to single space");
}

/**
 * @brief A backtick-quoted table name must still be present in the digest.
 *
 * Only the presence of "my_table" is checked, not the exact digest text.
 */
static void test_digest_backtick_identifiers() {
	const std::string digest = digest_query("SELECT * FROM `my_table` WHERE `id`=1");
	const bool table_kept = digest.find("my_table") != std::string::npos;
	ok(table_kept, "digest: backtick-quoted identifiers preserved");
}

/** @brief A number in scientific notation (1E10) must be digested to '?'. */
static void test_digest_scientific_notation() {
	const std::string expected = "select * from t where val=?";
	const std::string actual = digest_query("SELECT * FROM t WHERE val=1E10");
	ok(actual == expected, "digest: scientific notation replaced with ?");
}

/**
 * @brief A three-element IN list with the default grouping limit must keep
 *        one '?' per element.
 */
static void test_digest_in_clause() {
	const std::string expected = "select * from t where id in (?,?,?)";
	const std::string actual = digest_query("SELECT * FROM t WHERE id IN (1,2,3)");
	ok(actual == expected, "digest: IN clause literals replaced");
}

/**
 * @brief A five-element IN list digested with grouping_limit=3 must contain
 *        the "..." collapse marker.
 *
 * Only the presence of "..." is checked, not the exact digest text.
 */
static void test_digest_in_clause_grouping() {
	const bool lowercase = true;
	const bool replace_null = true;
	const bool no_digits = true;
	const int grouping_limit = 3;

	const std::string digest = digest_query(
		"SELECT * FROM t WHERE id IN (1,2,3,4,5)",
		lowercase, replace_null, no_digits, grouping_limit);

	const bool collapsed = digest.find("...") != std::string::npos;
	ok(collapsed, "digest: IN clause with grouping_limit collapses to '...'");
}

// ============================================================================
// Main
// ============================================================================

/**
 * @brief Test driver: announces the plan, initialises the harness, sets the
 *        thread-local digest defaults and runs every test function in order.
 *
 * Planned assertions: 1 (init) + 18 + 8 + 9 + 5 + 2 + 1 + 2 + 1 + 5 + 7 = 59.
 * The per-entry numbers in the table are the assertions each function emits;
 * keep them and plan() in sync when adding or removing tests.
 */
int main() {
	plan(59);

	const int init_rc = test_init_minimal();
	ok(init_rc == 0, "test_init_minimal() succeeds");

	// Set thread-local variables for functions that read them
	setup_digest_defaults();

	using test_case_fn = void (*)();
	static const test_case_fn test_cases[] = {
		// tokenizer — 18 assertions
		test_tokenizer_basic,                 // 4
		test_tokenizer_empties_ok,            // 4
		test_tokenizer_no_empties,            // 3
		test_tokenizer_empty_string,          // 1
		test_tokenizer_null_input,            // 1
		test_tokenizer_no_delimiter_found,    // 2
		test_tokenizer_long_string,           // 3

		// c_split_2 — 8 assertions
		test_c_split_2_basic,                 // 2
		test_c_split_2_no_delimiter,          // 2
		test_c_split_2_empty_string,          // 2
		test_c_split_2_multiple_delimiters,   // 2

		// digest: number/literal replacement — 9 assertions
		test_digest_simple_select,            // 1
		test_digest_insert_values,            // 1
		test_digest_string_literals,          // 1
		test_digest_double_quoted_string,     // 1
		test_digest_float_literal,            // 1
		test_digest_negative_number,          // 1
		test_digest_hex_literal,              // 1
		test_digest_null_replacement,         // 1
		test_digest_null_no_replacement,      // 1

		// digest: comment handling — 5 assertions
		test_digest_comment_stripped,         // 1
		test_digest_inline_comment,           // 1
		test_digest_hash_comment,             // 1
		test_digest_first_comment_extracted,  // 2

		// digest: lowercase — 2 assertions
		test_digest_lowercase,                // 1
		test_digest_no_lowercase,             // 1

		// digest: grouping — 1 assertion
		test_digest_grouping_limit,           // 1

		// first_stage — 2 assertions
		test_digest_first_stage_basic,        // 1
		test_digest_first_stage_string,       // 1

		// digest_2 wrapper — 1 assertion
		test_digest_2_basic,                  // 1

		// strip_comments — 5 assertions
		test_strip_comments_block,            // 3
		test_strip_comments_inline,           // 2

		// edge cases — 7 assertions
		test_digest_empty_query,              // 1
		test_digest_whitespace_only,          // 1
		test_digest_multiple_spaces,          // 1
		test_digest_backtick_identifiers,     // 1
		test_digest_scientific_notation,      // 1
		test_digest_in_clause,                // 1
		test_digest_in_clause_grouping,       // 1
	};

	for (const test_case_fn run_case : test_cases) {
		run_case();
	}

	test_cleanup_minimal();
	return exit_status();
}