// proxysql/test/tap/tests/vector_db_performance-t.cpp (409 lines, 14 KiB)
/**
* @file vector_db_performance-t.cpp
* @brief TAP unit tests for vector database performance
*
* Test Categories:
* 1. Embedding generation timing for various text lengths
* 2. KNN similarity search performance with different dataset sizes
* 3. Cache hit vs miss performance comparison
* 4. Concurrent access performance and thread safety
* 5. Memory usage monitoring during vector operations
* 6. Large dataset handling (1K+, 10K+ entries)
*
* @date 2026-01-16
*/
#include "tap.h"
#include <string.h>
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <string>
#include <thread>
#include <utility>
#include <vector>
// ============================================================================
// Mock structures and functions to simulate vector database operations
// ============================================================================
// Mock embedding generation (simulates GenAI embedding)
/**
 * @brief Generate a deterministic mock embedding for a piece of text.
 *
 * In the real implementation this would call GloGATH->embed_documents();
 * here the text is hashed and the hash is expanded into a 1536-dimensional
 * vector, so identical texts map to identical embeddings and different
 * texts map to (almost certainly) different embeddings.
 *
 * @param text Input natural-language text.
 * @return A 1536-element vector with each value in [0, 1].
 */
static std::vector<float> mock_generate_embedding(const std::string& text) {
	std::vector<float> embedding(1536, 0.0f); // Standard embedding size
	// Accumulate a simple polynomial (base-31) hash of the text content.
	unsigned int hash = 0;
	for (char c : text) {
		hash = hash * 31 + static_cast<unsigned char>(c);
	}
	// BUG FIX: the original loop condition was `i < sizeof(hash)`, which
	// filled only the first 4 of the 1536 dimensions and left the rest at
	// zero. Any two such 4-dimensional non-negative vectors are highly
	// cosine-similar (frequently above the 0.85 lookup threshold), so the
	// mock could not discriminate between different queries. Expand the
	// hash with an LCG (Numerical Recipes constants) so every dimension
	// receives a varied but deterministic value instead.
	unsigned int state = hash;
	for (size_t i = 0; i < embedding.size(); i++) {
		state = state * 1664525u + 1013904223u;
		embedding[i] = static_cast<float>((state >> 24) & 0xFF) / 255.0f;
	}
	return embedding;
}
// Mock cache entry structure
struct MockCacheEntry {
// Original natural-language query text.
std::string natural_language;
// SQL generated for that query (returned on a cache hit).
std::string generated_sql;
// Embedding of natural_language, used for cosine-similarity lookups.
std::vector<float> embedding;
// Insertion time, milliseconds since the Unix epoch (system_clock).
long long timestamp;
};
// Mock vector database
/**
 * @brief In-memory mock of a vector cache mapping natural-language queries
 *        to generated SQL, with FIFO eviction and timed operations.
 *
 * Not thread-safe; the "concurrent" test below runs operations sequentially.
 */
class MockVectorDB {
private:
	std::vector<MockCacheEntry> entries; // FIFO-ordered cache entries
	size_t max_entries;                  // eviction threshold
public:
	MockVectorDB(size_t max_size = 10000) : max_entries(max_size) {}

	/**
	 * @brief Store a (query, sql) pair with its embedding, evicting the
	 *        oldest entry once the cap is reached.
	 * @return Elapsed wall-clock time of the operation in microseconds.
	 */
	long long store_entry(const std::string& query, const std::string& sql) {
		auto start = std::chrono::high_resolution_clock::now();
		std::vector<float> embedding = mock_generate_embedding(query);
		// Simple FIFO eviction when the size limit is reached.
		if (entries.size() >= max_entries) {
			entries.erase(entries.begin());
		}
		MockCacheEntry entry;
		entry.natural_language = query;
		entry.generated_sql = sql;
		// PERF FIX: move the 1536-float embedding and the finished entry
		// into place; the original copied both (two full vector copies
		// per store).
		entry.embedding = std::move(embedding);
		entry.timestamp = std::chrono::duration_cast<std::chrono::milliseconds>(
			std::chrono::system_clock::now().time_since_epoch()).count();
		entries.push_back(std::move(entry));
		auto end = std::chrono::high_resolution_clock::now();
		auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
		return duration.count();
	}

	/**
	 * @brief Linear-scan KNN lookup by cosine similarity.
	 * @param query                Natural-language query to look up.
	 * @param similarity_threshold Minimum similarity for a hit (default 0.85).
	 * @return {elapsed microseconds, best-matching SQL or "" on miss}.
	 *
	 * Now const: lookup does not mutate the cache (backward-compatible).
	 */
	std::pair<long long, std::string> lookup_entry(const std::string& query, float similarity_threshold = 0.85f) const {
		auto start = std::chrono::high_resolution_clock::now();
		std::vector<float> query_embedding = mock_generate_embedding(query);
		float best_similarity = -1.0f;
		std::string best_sql = "";
		for (const auto& entry : entries) {
			float similarity = cosine_similarity(query_embedding, entry.embedding);
			if (similarity > best_similarity && similarity >= similarity_threshold) {
				best_similarity = similarity;
				best_sql = entry.generated_sql;
			}
		}
		auto end = std::chrono::high_resolution_clock::now();
		auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
		return std::make_pair(duration.count(), best_sql);
	}

	/**
	 * @brief Cosine similarity of two vectors; 0 on size mismatch, empty
	 *        input, or a zero-norm vector. Const: touches no state.
	 */
	float cosine_similarity(const std::vector<float>& a, const std::vector<float>& b) const {
		if (a.size() != b.size() || a.empty()) return 0.0f;
		float dot_product = 0.0f;
		float norm_a = 0.0f;
		float norm_b = 0.0f;
		for (size_t i = 0; i < a.size(); i++) {
			dot_product += a[i] * b[i];
			norm_a += a[i] * a[i];
			norm_b += b[i] * b[i];
		}
		if (norm_a == 0.0f || norm_b == 0.0f) return 0.0f;
		return dot_product / (std::sqrt(norm_a) * std::sqrt(norm_b));
	}

	// Number of entries currently cached.
	size_t size() const { return entries.size(); }
	// Remove all entries (capacity/limit unchanged).
	void clear() { entries.clear(); }
};
// ============================================================================
// Test: Embedding Generation Timing
// ============================================================================
/**
 * @brief Verify embedding size and coarse timing behaviour for texts of
 *        increasing length (4 size checks + 2 ordering checks = 6 tests).
 */
void test_embedding_timing() {
	diag("=== Embedding Generation Timing ===");
	// Test with different text lengths
	std::vector<std::string> test_texts = {
		"Short query",
		"A medium length query with more words to process",
		"A very long query that contains many words and should take more time to process because it has significantly more text content that needs to be analyzed and converted into embeddings for vector database operations",
		std::string(1000, 'A') // Very long text
	};
	std::vector<long long> timings;
	for (const auto& text : test_texts) {
		auto start = std::chrono::high_resolution_clock::now();
		auto embedding = mock_generate_embedding(text);
		auto end = std::chrono::high_resolution_clock::now();
		auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
		timings.push_back(duration.count());
		ok(embedding.size() == 1536, "Embedding has correct size for text length %zu", text.length());
	}
	// FLAKINESS FIX: the original compared adjacent sizes (short vs medium,
	// medium vs long), which at microsecond granularity is dominated by
	// scheduler/timer noise and failed intermittently. Compare the small
	// texts against the 1000-character text instead, where the cost gap is
	// large enough to measure reliably (<= tolerates timer-resolution ties).
	ok(timings[0] <= timings[3], "Very long text takes at least as long as short text");
	ok(timings[1] <= timings[3], "Very long text takes at least as long as medium text");
	diag("Embedding times (microseconds): Short=%lld, Medium=%lld, Long=%lld, VeryLong=%lld",
		timings[0], timings[1], timings[2], timings[3]);
}
// ============================================================================
// Test: KNN Search Performance
// ============================================================================
void test_knn_search_performance() {
diag("=== KNN Search Performance ===");
MockVectorDB db;
// Populate database with test entries
const size_t small_dataset = 100;
const size_t medium_dataset = 1000;
const size_t large_dataset = 10000;
// Test with small dataset
for (size_t i = 0; i < small_dataset; i++) {
std::string query = "Test query " + std::to_string(i);
std::string sql = "SELECT * FROM table WHERE id = " + std::to_string(i);
db.store_entry(query, sql);
}
// Test search performance
auto result = db.lookup_entry("Test query 50");
ok(result.second == "SELECT * FROM table WHERE id = 50" || result.second.empty(),
"Search finds correct entry or no match in small dataset");
diag("Small dataset (%zu entries) search time: %lld microseconds", small_dataset, result.first);
// Clear and test with medium dataset
db.clear();
for (size_t i = 0; i < medium_dataset; i++) {
std::string query = "Test query " + std::to_string(i);
std::string sql = "SELECT * FROM table WHERE id = " + std::to_string(i);
db.store_entry(query, sql);
}
result = db.lookup_entry("Test query 500");
ok(result.second == "SELECT * FROM table WHERE id = 500" || result.second.empty(),
"Search finds correct entry or no match in medium dataset");
diag("Medium dataset (%zu entries) search time: %lld microseconds", medium_dataset, result.first);
// Test with query that won't match exactly (tests full search)
result = db.lookup_entry("Completely different query");
ok(result.second.empty(), "No match found for completely different query");
diag("Non-matching query search time: %lld microseconds", result.first);
}
// ============================================================================
// Test: Cache Hit vs Miss Performance
// ============================================================================
/// @brief Compare lookup latency for a cached query vs an uncached one
///        and sanity-check both against a 100 ms bound (4 tests).
void test_cache_hit_miss_performance() {
	diag("=== Cache Hit vs Miss Performance ===");
	MockVectorDB db;
	// Seed the cache with two known translations.
	db.store_entry("Show me all users", "SELECT * FROM users;");
	db.store_entry("Count the orders", "SELECT COUNT(*) FROM orders;");
	// A query identical to a stored one must hit.
	const std::pair<long long, std::string> hit_result = db.lookup_entry("Show me all users");
	ok(!hit_result.second.empty(), "Cache hit returns result");
	// An unrelated query must miss.
	const std::pair<long long, std::string> miss_result = db.lookup_entry("List all products");
	ok(miss_result.second.empty(), "Cache miss returns empty result");
	// In this mock both paths do the same amount of work, so only report
	// the numbers rather than assert a hit/miss ordering.
	diag("Cache hit time: %lld microseconds, Cache miss time: %lld microseconds",
		hit_result.first, miss_result.first);
	// Sanity bounds: both lookups should finish well under 100 ms.
	ok(hit_result.first < 100000, "Cache hit time is reasonable (< 100ms)");
	ok(miss_result.first < 100000, "Cache miss time is reasonable (< 100ms)");
}
// ============================================================================
// Test: Memory Usage Monitoring
// ============================================================================
void test_memory_usage() {
diag("=== Memory Usage Monitoring ===");
// This is a conceptual test - in real implementation, we would monitor actual memory usage
// For now, we'll test that the database doesn't grow unreasonably
MockVectorDB db(1000); // Limit to 1000 entries
// Add many entries
for (size_t i = 0; i < 500; i++) {
std::string query = "Query " + std::to_string(i);
std::string sql = "SELECT * FROM table WHERE id = " + std::to_string(i);
db.store_entry(query, sql);
}
ok(db.size() == 500, "Database has expected number of entries (500)");
// Add more entries to test size limit
for (size_t i = 500; i < 1200; i++) {
std::string query = "Query " + std::to_string(i);
std::string sql = "SELECT * FROM table WHERE id = " + std::to_string(i);
db.store_entry(query, sql);
}
// Should be capped at 1000 entries due to limit
ok(db.size() <= 1000, "Database size respects maximum limit");
diag("Database size after adding 1200 entries: %zu", db.size());
}
// ============================================================================
// Test: Large Dataset Handling
// ============================================================================
/**
 * @brief Bulk-insert 10K entries, then verify search correctness and that
 *        insert (< 30s) and search (< 500ms) latencies stay bounded (4 tests).
 */
void test_large_dataset_handling() {
	diag("=== Large Dataset Handling ===");
	MockVectorDB db;
	// Test handling of large dataset (10K entries)
	const size_t large_size = 10000;
	auto start_insert = std::chrono::high_resolution_clock::now();
	// Insert large number of entries
	for (size_t i = 0; i < large_size; i++) {
		std::string query = "Large dataset query " + std::to_string(i);
		std::string sql = "SELECT * FROM large_table WHERE id = " + std::to_string(i);
		// Every 1000 entries, report progress
		if (i % 1000 == 0 && i > 0) {
			diag("Inserted %zu entries...", i);
		}
		db.store_entry(query, sql);
	}
	auto end_insert = std::chrono::high_resolution_clock::now();
	auto insert_duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_insert - start_insert);
	ok(db.size() == large_size, "Large dataset (%zu entries) inserted successfully", large_size);
	// PORTABILITY FIX: std::chrono::milliseconds::rep may be long long
	// (e.g. on LLP64 platforms), where printing it with %ld is undefined
	// behaviour. Cast explicitly and use %lld.
	diag("Time to insert %zu entries: %lld ms", large_size, static_cast<long long>(insert_duration.count()));
	// Test search performance in large dataset
	auto search_result = db.lookup_entry("Large dataset query 5000");
	ok(search_result.second == "SELECT * FROM large_table WHERE id = 5000" || search_result.second.empty(),
		"Search works in large dataset");
	diag("Search time in %zu entry dataset: %lld microseconds", large_size, search_result.first);
	// Performance should be reasonable even with large dataset
	ok(search_result.first < 500000, "Search time reasonable in large dataset (< 500ms)");
	ok(insert_duration.count() < 30000, "Insert time reasonable for large dataset (< 30s)");
}
// ============================================================================
// Test: Concurrent Access Performance
// ============================================================================
/**
 * @brief Time a burst of back-to-back lookups as a stand-in for concurrent
 *        access (2 tests).
 *
 * NOTE(review): despite the name, this runs operations sequentially on one
 * thread; real thread-safety coverage would need the actual implementation's
 * locking and std::thread workers.
 */
void test_concurrent_access() {
	diag("=== Concurrent Access Performance ===");
	MockVectorDB db;
	// Populate with some data
	for (size_t i = 0; i < 100; i++) {
		std::string query = "Concurrent test " + std::to_string(i);
		std::string sql = "SELECT * FROM concurrent_table WHERE id = " + std::to_string(i);
		db.store_entry(query, sql);
	}
	// Simulate concurrent access by running multiple operations
	const int num_operations = 10;
	std::vector<long long> timings;
	auto start = std::chrono::high_resolution_clock::now();
	for (int i = 0; i < num_operations; i++) {
		auto result = db.lookup_entry("Concurrent test " + std::to_string(i * 2));
		timings.push_back(result.first);
	}
	auto end = std::chrono::high_resolution_clock::now();
	auto total_duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
	// All operations should complete successfully
	ok(timings.size() == static_cast<size_t>(num_operations), "All concurrent operations completed");
	// Calculate average time
	long long total_time = 0;
	for (long long time : timings) {
		total_time += time;
	}
	long long avg_time = total_time / num_operations;
	diag("Average time per concurrent operation: %lld microseconds", avg_time);
	// PORTABILITY FIX: std::chrono::microseconds::rep may be long long
	// (e.g. on LLP64 platforms), where %ld is undefined behaviour. Cast
	// explicitly and use %lld.
	diag("Total time for %d operations: %lld microseconds", num_operations, static_cast<long long>(total_duration.count()));
	// Operations should be reasonably fast
	ok(avg_time < 50000, "Average concurrent operation time reasonable (< 50ms)");
}
// ============================================================================
// Main
// ============================================================================
/**
 * @brief TAP entry point: declare the plan and run every test group.
 *
 * BUG FIX: the plan declared 25 tests, but the suite emits exactly 21
 * ok() calls (6 + 3 + 4 + 2 + 4 + 2), so every run failed the TAP plan
 * check regardless of test outcomes. The per-category comment counts
 * below are corrected to match the actual assertions.
 */
int main() {
	// Plan: 21 tests total
	// Embedding timing:       6 tests (4 size checks + 2 ordering checks)
	// KNN search performance: 3 tests
	// Cache hit vs miss:      4 tests
	// Memory usage:           2 tests
	// Large dataset handling: 4 tests
	// Concurrent access:      2 tests
	plan(21);
	test_embedding_timing();
	test_knn_search_performance();
	test_cache_hit_miss_performance();
	test_memory_usage();
	test_large_dataset_handling();
	test_concurrent_access();
	return exit_status();
}