/**
 * @file vector_db_performance-t.cpp
 * @brief TAP unit tests for vector database performance
 *
 * Test Categories:
 * 1. Embedding generation timing for various text lengths
 * 2. KNN similarity search performance with different dataset sizes
 * 3. Cache hit vs miss performance comparison
 * 4. Concurrent access performance and thread safety
 * 5. Memory usage monitoring during vector operations
 * 6. Large dataset handling (1K+, 10K+ entries)
 *
 * @date 2026-01-16
 */

#include "tap.h"

#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <string>
#include <thread>
#include <utility>
#include <vector>

// ============================================================================
// Mock structures and functions to simulate vector database operations
// ============================================================================

/**
 * Mock embedding generation (simulates GenAI embedding).
 *
 * In the real implementation this would call GloGATH->embed_documents().
 * Produces a deterministic 1536-dimension embedding derived from a simple
 * polynomial hash of the input text, so identical texts always map to
 * identical vectors (which is what makes cache hits testable below).
 *
 * @param text  Natural-language text to embed.
 * @return      1536-element embedding; only the first sizeof(hash) slots
 *              carry non-zero data (intentional for the mock).
 */
static std::vector<float> mock_generate_embedding(const std::string& text) {
    std::vector<float> embedding(1536, 0.0f);  // Standard embedding size

    // Fill with pseudo-random values based on text content
    unsigned int hash = 0;
    for (char c : text) {
        hash = hash * 31 + static_cast<unsigned char>(c);
    }

    // Use hash to generate deterministic but varied embedding values.
    // Each populated slot takes one byte of the hash.
    for (size_t i = 0; i < embedding.size() && i < sizeof(hash); i++) {
        embedding[i] = static_cast<float>((hash >> (i * 8)) & 0xFF) / 255.0f;
    }

    return embedding;
}

// Mock cache entry structure: one cached natural-language -> SQL mapping
// together with its embedding and insertion timestamp.
struct MockCacheEntry {
    std::string natural_language;
    std::string generated_sql;
    std::vector<float> embedding;
    long long timestamp;
};

/**
 * Mock vector database.
 *
 * Stores (query, sql, embedding) entries up to a fixed capacity with
 * FIFO eviction, and answers similarity lookups by brute-force cosine
 * similarity over all entries. NOTE(review): not thread-safe — the
 * "concurrent" test below runs operations sequentially.
 */
class MockVectorDB {
private:
    std::vector<MockCacheEntry> entries;
    size_t max_entries;

public:
    MockVectorDB(size_t max_size = 10000) : max_entries(max_size) {}

    /**
     * Simulate cache storage with timing.
     *
     * @param query  Natural-language query (embedded on insert).
     * @param sql    Generated SQL to cache for the query.
     * @return       Wall-clock time of the store, in microseconds.
     */
    long long store_entry(const std::string& query, const std::string& sql) {
        auto start = std::chrono::high_resolution_clock::now();

        // Generate embedding
        std::vector<float> embedding = mock_generate_embedding(query);

        // Check if we need to evict old entries
        if (entries.size() >= max_entries) {
            // Remove oldest entry (simple FIFO)
            entries.erase(entries.begin());
        }

        // Add new entry
        MockCacheEntry entry;
        entry.natural_language = query;
        entry.generated_sql = sql;
        entry.embedding = embedding;
        // Epoch timestamp; milliseconds assumed here — precision is
        // irrelevant to these tests (the field is never read back).
        entry.timestamp = std::chrono::duration_cast<std::chrono::milliseconds>(
            std::chrono::system_clock::now().time_since_epoch()).count();
        entries.push_back(entry);

        auto end = std::chrono::high_resolution_clock::now();
        auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
        return duration.count();
    }

    /**
     * Simulate cache lookup with timing.
     *
     * Embeds the query and linearly scans all entries for the best
     * cosine-similarity match at or above the threshold.
     *
     * @param query                 Natural-language query to look up.
     * @param similarity_threshold  Minimum cosine similarity for a hit.
     * @return  pair of (elapsed microseconds, matched SQL or "" on miss).
     */
    std::pair<long long, std::string> lookup_entry(const std::string& query,
                                                   float similarity_threshold = 0.85f) {
        auto start = std::chrono::high_resolution_clock::now();

        // Generate embedding for query
        std::vector<float> query_embedding = mock_generate_embedding(query);

        // Find best match using cosine similarity
        float best_similarity = -1.0f;
        std::string best_sql = "";

        for (const auto& entry : entries) {
            float similarity = cosine_similarity(query_embedding, entry.embedding);
            if (similarity > best_similarity && similarity >= similarity_threshold) {
                best_similarity = similarity;
                best_sql = entry.generated_sql;
            }
        }

        auto end = std::chrono::high_resolution_clock::now();
        auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
        return std::make_pair(duration.count(), best_sql);
    }

    /**
     * Calculate cosine similarity between two vectors.
     *
     * @return  Similarity in [-1, 1]; 0 on size mismatch, empty input,
     *          or a zero-norm vector (avoids division by zero).
     */
    float cosine_similarity(const std::vector<float>& a, const std::vector<float>& b) {
        if (a.size() != b.size() || a.empty()) return 0.0f;

        float dot_product = 0.0f;
        float norm_a = 0.0f;
        float norm_b = 0.0f;

        for (size_t i = 0; i < a.size(); i++) {
            dot_product += a[i] * b[i];
            norm_a += a[i] * a[i];
            norm_b += b[i] * b[i];
        }

        if (norm_a == 0.0f || norm_b == 0.0f) return 0.0f;

        return dot_product / (std::sqrt(norm_a) * std::sqrt(norm_b));
    }

    size_t size() const { return entries.size(); }

    void clear() { entries.clear(); }
};

// ============================================================================
// Test: Embedding Generation Timing
// ============================================================================
void test_embedding_timing() {
    diag("=== Embedding Generation Timing ===");

    // Test with different text lengths
    std::vector<std::string> test_texts = {
        "Short query",
        "A medium length query with more words to process",
        "A very long query that contains many words and should take more time to process "
        "because it has significantly more text content that needs to be analyzed and "
        "converted into embeddings for vector database operations",
        std::string(1000, 'A')  // Very long text
    };

    std::vector<long long> timings;

    for (const auto& text : test_texts) {
        auto start = std::chrono::high_resolution_clock::now();
        auto embedding = mock_generate_embedding(text);
        auto end = std::chrono::high_resolution_clock::now();

        auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
        timings.push_back(duration.count());

        ok(embedding.size() == 1536,
           "Embedding has correct size for text length %zu", text.length());
    }

    // Verify that longer texts take more time (roughly)
    ok(timings[0] <= timings[1], "Medium text takes longer than short text");
    ok(timings[1] <= timings[2], "Long text takes longer than medium text");

    diag("Embedding times (microseconds): Short=%lld, Medium=%lld, Long=%lld, VeryLong=%lld",
         timings[0], timings[1], timings[2], timings[3]);
}

// ============================================================================
// Test: KNN Search Performance
// ============================================================================
void test_knn_search_performance() {
    diag("=== KNN Search Performance ===");

    MockVectorDB db;

    // Populate database with test entries
    const size_t small_dataset = 100;
    const size_t medium_dataset = 1000;

    // Test with small dataset
    for (size_t i = 0; i < small_dataset; i++) {
        std::string query = "Test query " + std::to_string(i);
        std::string sql = "SELECT * FROM table WHERE id = " + std::to_string(i);
        db.store_entry(query, sql);
    }

    // Test search performance
    auto result = db.lookup_entry("Test query 50");
    ok(result.second == "SELECT * FROM table WHERE id = 50" || result.second.empty(),
       "Search finds correct entry or no match in small dataset");
    diag("Small dataset (%zu entries) search time: %lld microseconds",
         small_dataset, result.first);

    // Clear and test with medium dataset
    db.clear();
    for (size_t i = 0; i < medium_dataset; i++) {
        std::string query = "Test query " + std::to_string(i);
        std::string sql = "SELECT * FROM table WHERE id = " + std::to_string(i);
        db.store_entry(query, sql);
    }

    result = db.lookup_entry("Test query 500");
    ok(result.second == "SELECT * FROM table WHERE id = 500" || result.second.empty(),
       "Search finds correct entry or no match in medium dataset");
    diag("Medium dataset (%zu entries) search time: %lld microseconds",
         medium_dataset, result.first);

    // Test with query that won't match exactly (tests full search)
    result = db.lookup_entry("Completely different query");
    ok(result.second.empty(), "No match found for completely different query");
    diag("Non-matching query search time: %lld microseconds", result.first);
}

// ============================================================================
// Test: Cache Hit vs Miss Performance
// ============================================================================
void test_cache_hit_miss_performance() {
    diag("=== Cache Hit vs Miss Performance ===");

    MockVectorDB db;

    // Add some entries
    db.store_entry("Show me all users", "SELECT * FROM users;");
    db.store_entry("Count the orders", "SELECT COUNT(*) FROM orders;");

    // Test cache hit
    auto hit_result = db.lookup_entry("Show me all users");
    ok(!hit_result.second.empty(), "Cache hit returns result");

    // Test cache miss
    auto miss_result = db.lookup_entry("List all products");
    ok(miss_result.second.empty(), "Cache miss returns empty result");

    // Hit and miss both scan the full store, so their timings should be
    // roughly similar in this mock; just report them.
    diag("Cache hit time: %lld microseconds, Cache miss time: %lld microseconds",
         hit_result.first, miss_result.first);

    // Both should be reasonable times
    ok(hit_result.first < 100000, "Cache hit time is reasonable (< 100ms)");
    ok(miss_result.first < 100000, "Cache miss time is reasonable (< 100ms)");
}

// ============================================================================
// Test: Memory Usage Monitoring
// ============================================================================
void test_memory_usage() {
    diag("=== Memory Usage Monitoring ===");

    // This is a conceptual test - in real implementation, we would monitor
    // actual memory usage. For now, we'll test that the database doesn't
    // grow unreasonably.
    MockVectorDB db(1000);  // Limit to 1000 entries

    // Add many entries
    for (size_t i = 0; i < 500; i++) {
        std::string query = "Query " + std::to_string(i);
        std::string sql = "SELECT * FROM table WHERE id = " + std::to_string(i);
        db.store_entry(query, sql);
    }

    ok(db.size() == 500, "Database has expected number of entries (500)");

    // Add more entries to test size limit
    for (size_t i = 500; i < 1200; i++) {
        std::string query = "Query " + std::to_string(i);
        std::string sql = "SELECT * FROM table WHERE id = " + std::to_string(i);
        db.store_entry(query, sql);
    }

    // Should be capped at 1000 entries due to limit
    ok(db.size() <= 1000, "Database size respects maximum limit");
    diag("Database size after adding 1200 entries: %zu", db.size());
}

// ============================================================================
// Test: Large Dataset Handling
// ============================================================================
void test_large_dataset_handling() {
    diag("=== Large Dataset Handling ===");

    MockVectorDB db;

    // Test handling of large dataset (10K entries)
    const size_t large_size = 10000;

    auto start_insert = std::chrono::high_resolution_clock::now();

    // Insert large number of entries
    for (size_t i = 0; i < large_size; i++) {
        std::string query = "Large dataset query " + std::to_string(i);
        std::string sql = "SELECT * FROM large_table WHERE id = " + std::to_string(i);

        // Every 1000 entries, report progress
        if (i % 1000 == 0 && i > 0) {
            diag("Inserted %zu entries...", i);
        }

        db.store_entry(query, sql);
    }

    auto end_insert = std::chrono::high_resolution_clock::now();
    auto insert_duration =
        std::chrono::duration_cast<std::chrono::milliseconds>(end_insert - start_insert);

    ok(db.size() == large_size,
       "Large dataset (%zu entries) inserted successfully", large_size);
    // %lld with an explicit cast: the count() rep type is implementation-
    // defined, so don't rely on it matching %ld.
    diag("Time to insert %zu entries: %lld ms",
         large_size, static_cast<long long>(insert_duration.count()));

    // Test search performance in large dataset
    auto search_result = db.lookup_entry("Large dataset query 5000");
    ok(search_result.second == "SELECT * FROM large_table WHERE id = 5000" ||
       search_result.second.empty(),
       "Search works in large dataset");
    diag("Search time in %zu entry dataset: %lld microseconds",
         large_size, search_result.first);

    // Performance should be reasonable even with large dataset
    ok(search_result.first < 500000, "Search time reasonable in large dataset (< 500ms)");
    ok(insert_duration.count() < 30000, "Insert time reasonable for large dataset (< 30s)");
}

// ============================================================================
// Test: Concurrent Access Performance
// ============================================================================
void test_concurrent_access() {
    diag("=== Concurrent Access Performance ===");

    // This is a simplified test - in real implementation, we would test
    // actual thread safety. Operations here run back-to-back on one thread.
    MockVectorDB db;

    // Populate with some data
    for (size_t i = 0; i < 100; i++) {
        std::string query = "Concurrent test " + std::to_string(i);
        std::string sql = "SELECT * FROM concurrent_table WHERE id = " + std::to_string(i);
        db.store_entry(query, sql);
    }

    // Simulate concurrent access by running multiple operations
    const int num_operations = 10;
    std::vector<long long> timings;

    auto start = std::chrono::high_resolution_clock::now();

    for (int i = 0; i < num_operations; i++) {
        auto result = db.lookup_entry("Concurrent test " + std::to_string(i * 2));
        timings.push_back(result.first);
    }

    auto end = std::chrono::high_resolution_clock::now();
    auto total_duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);

    // All operations should complete successfully
    ok(timings.size() == static_cast<size_t>(num_operations),
       "All concurrent operations completed");

    // Calculate average time
    long long total_time = 0;
    for (long long time : timings) {
        total_time += time;
    }
    long long avg_time = total_time / num_operations;

    diag("Average time per concurrent operation: %lld microseconds", avg_time);
    diag("Total time for %d operations: %lld microseconds",
         num_operations, static_cast<long long>(total_duration.count()));

    // Operations should be reasonably fast
    ok(avg_time < 50000, "Average concurrent operation time reasonable (< 50ms)");
}

// ============================================================================
// Main
// ============================================================================
int main() {
    // Plan: 21 tests total (must equal the number of ok() calls or the
    // TAP harness reports a plan mismatch)
    // Embedding timing: 6 tests
    // KNN search performance: 3 tests
    // Cache hit vs miss: 4 tests
    // Memory usage: 2 tests
    // Large dataset handling: 4 tests
    // Concurrent access: 2 tests
    plan(21);

    test_embedding_timing();
    test_knn_search_performance();
    test_cache_hit_miss_performance();
    test_memory_usage();
    test_large_dataset_handling();
    test_concurrent_access();

    return exit_status();
}