You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
proxysql/doc/sqlite-rembed-test.sh

574 lines
20 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#!/bin/bash
###############################################################################
# sqlite-rembed Integration Test Suite
#
# This script comprehensively tests the sqlite-rembed integration in ProxySQL,
# verifying all components of the embedding generation and vector search pipeline.
#
# Tests performed:
# 1. Basic connectivity to ProxySQL SQLite3 server
# 2. Function registration (rembed, rembed_client_options)
# 3. Client configuration in temp.rembed_clients virtual table
# 4. Embedding generation via remote HTTP API
# 5. Vector table creation and data storage
# 6. Similarity search with generated embeddings
# 7. Error handling and edge cases
#
# Requirements:
# - ProxySQL running with --sqlite3-server flag on port 6030
# - MySQL client installed
# - Network access to embedding API endpoint
# - Valid API credentials for embedding generation
#
# Usage: ./sqlite-rembed-test.sh
#
# Exit codes:
# 0 - All tests passed
# 1 - One or more tests failed
# 2 - Connection/proxy setup failed
#
# Author: Generated from integration testing session
# Date: not recorded (see version-control history; a $(date) placeholder is never expanded inside a comment)
###############################################################################
# Strict mode: stop on unhandled errors, on unset variables, and on any
# failing stage of a pipeline.
set -euo pipefail

# --- Terminal formatting (escape sequences, interpreted by `echo -e`) --------
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
BOLD='\033[1m'
UNDERLINE='\033[4m'
NC='\033[0m' # Reset all attributes ("No Color")

# --- ProxySQL SQLite3 server connection - adjust to your environment ---------
PROXYSQL_HOST="127.0.0.1"
PROXYSQL_PORT="6030"
MYSQL_USER="root"
MYSQL_PASS="root"

# --- Embedding API settings (synthetic OpenAI-compatible endpoint) -----------
# IMPORTANT: export API_KEY before running, or replace YOUR_API_KEY below.
API_FORMAT="openai"
API_URL="https://api.synthetic.new/openai/v1/embeddings"
API_MODEL="hf:nomic-ai/nomic-embed-text-v1.5"
API_KEY="${API_KEY:-YOUR_API_KEY}" # Environment value wins; placeholder otherwise
API_CLIENT_NAME="test-client-$(date +%s)" # Epoch suffix gives each run its own client name
VECTOR_DIMENSIONS=768 # Must match the embedding size produced by API_MODEL

# --- Result bookkeeping, updated by the print_* helpers ----------------------
TOTAL_TESTS=0
PASSED_TESTS=0
FAILED_TESTS=0
CURRENT_TEST=""
###############################################################################
# Helper Functions
###############################################################################
#######################################
# Print a section banner surrounded by blank lines.
# Globals:   BLUE, BOLD, UNDERLINE, NC (read)
# Arguments: $1 - banner text
# Outputs:   the styled banner on stdout
#######################################
print_header() {
  local banner_text="$1"
  local style_on="${BLUE}${BOLD}${UNDERLINE}"
  echo -e "\n${style_on}${banner_text}${NC}\n"
}
#######################################
# Announce the start of a test case and count it.
# Globals:   YELLOW, NC (read); CURRENT_TEST, TOTAL_TESTS (written)
# Arguments: $1 - human-readable test name
# Outputs:   "[TEST] <name>" line on stdout
# Returns:   0
#######################################
print_test() {
  echo -e "${YELLOW}[TEST]${NC} $1"
  CURRENT_TEST="$1"
  # Use an assignment rather than ((TOTAL_TESTS++)): the post-increment form
  # evaluates to the old value, so on the first call (0) it yields exit status 1
  # and, with `set -e` active, terminates the whole script.
  TOTAL_TESTS=$((TOTAL_TESTS + 1))
}
#######################################
# Report a passing test and count it.
# Globals:   GREEN, NC (read); PASSED_TESTS (written)
# Arguments: $1 - message describing what succeeded
# Outputs:   success line on stdout
# Returns:   0
#######################################
print_success() {
  echo -e "${GREEN}✅ SUCCESS:${NC} $1"
  # Assignment form on purpose: ((PASSED_TESTS++)) returns status 1 when the
  # counter is still 0, which aborts the script under `set -e`.
  PASSED_TESTS=$((PASSED_TESTS + 1))
}
#######################################
# Report a failing test with its error detail and count it.
# Globals:   RED, NC (read); FAILED_TESTS (written)
# Arguments: $1 - name of the failed test
#            $2 - error detail (optional; empty when omitted)
# Outputs:   failure line and an "Error:" line on stdout
# Returns:   0
#######################################
print_failure() {
  echo -e "${RED}❌ FAILURE:${NC} $1"
  # ${2:-} keeps a missing detail argument from tripping `set -u`.
  echo " Error: ${2:-}"
  # Assignment form on purpose: ((FAILED_TESTS++)) returns status 1 when the
  # counter is still 0, which aborts the script under `set -e` on the very
  # first recorded failure.
  FAILED_TESTS=$((FAILED_TESTS + 1))
}
#######################################
# Print an informational message; does not touch any test counter.
# Globals:   BLUE, NC (read)
# Arguments: $1 - message text
# Outputs:   the tagged message on stdout
#######################################
print_info() {
  local message="$1"
  echo -e "${BLUE} INFO:${NC} ${message}"
}
#######################################
# Send SQL to ProxySQL through the mysql command-line client.
# Globals:   PROXYSQL_HOST, PROXYSQL_PORT, MYSQL_USER, MYSQL_PASS (read)
# Arguments: $1 - SQL text to execute
#            $2 - "true" to add -s -N to the client invocation (intended for
#                 captured output); any other value, or omitted, leaves the
#                 client's default output format
# Outputs:   client stdout and stderr, merged onto stdout
# Returns:   exit status of the mysql client
#######################################
execute_query() {
  local sql_query="$1"
  local capture_output="${2:-false}"
  local -a client_args=(
    -h "$PROXYSQL_HOST"
    -P "$PROXYSQL_PORT"
    -u "$MYSQL_USER"
    -p"$MYSQL_PASS"
  )

  if [ "$capture_output" = "true" ]; then
    client_args+=(-s -N)
  fi

  # stderr is folded into stdout so callers that capture the output also see
  # whatever the client wrote to stderr.
  mysql "${client_args[@]}" -e "$sql_query" 2>&1
}
#######################################
# Run one SQL-based test case and record the outcome.
# The test passes when the query exits with status 0 and, if a pattern is
# given, the captured output matches it.
# Arguments: $1 - test name (shown in the report)
#            $2 - SQL to execute
#            $3 - optional grep basic-regex pattern the output must match;
#                 empty or omitted means "exit status only"
# Outputs:   progress and result lines on stdout
# Returns:   0 in the failure branches; otherwise the status of print_success
#######################################
run_test() {
  local test_name="$1"
  local sql_query="$2"
  local expected_pattern="${3:-}"
  local result
  local exit_code=0

  print_test "$test_name"

  # Capture the status with `||`: under `set -e` a bare failing assignment
  # would terminate the script before the failure could be reported.
  result=$(execute_query "$sql_query" "true") || exit_code=$?

  if [ "$exit_code" -ne 0 ]; then
    print_failure "$test_name" "$result"
    return 0
  fi

  # A here-string replaces `echo ... | grep -q`: with pipefail, grep -q leaving
  # early can kill echo with SIGPIPE and make a real match look like a miss.
  if [ -n "$expected_pattern" ] \
    && ! grep -q -- "$expected_pattern" <<<"$result"; then
    print_failure "$test_name" "Pattern '$expected_pattern' not found in output"
    echo " Output: $result"
    return 0
  fi

  print_success "$test_name"
}
#######################################
# Best-effort removal of tables that earlier runs may have left behind.
# For every base table name, the table itself and four suffixed companions
# (_info, _chunks, _rowids, _vector_chunks00) are dropped.
# Outputs:   two informational lines on stdout; query output is discarded
# Returns:   status of the final print_info call; drop failures are ignored
#######################################
cleanup_tables() {
  print_info "Cleaning up existing test tables..."

  local -a tables=(
    "test_documents"
    "test_embeddings"
    "test_docs"
    "test_embeds"
    "documents"
    "document_embeddings"
    "demo_texts"
    "demo_embeddings"
  )
  # The empty suffix addresses the base table itself.
  local -a suffixes=("" "_info" "_chunks" "_rowids" "_vector_chunks00")
  local table suffix

  for table in "${tables[@]}"; do
    for suffix in "${suffixes[@]}"; do
      # `|| true`: this is cleanup, so a rejected DROP must not abort the
      # whole script under `set -e`.
      execute_query "DROP TABLE IF EXISTS ${table}${suffix};" >/dev/null 2>&1 || true
    done
  done

  print_info "Cleanup completed"
}
#######################################
# Print the totals collected during the run and a closing verdict.
# Globals:   TOTAL_TESTS, PASSED_TESTS, FAILED_TESTS (read);
#            BOLD, UNDERLINE, GREEN, RED, NC (read)
# Outputs:   summary block on stdout
# Returns:   0 when FAILED_TESTS is zero, 1 otherwise
#######################################
print_summary() {
  local failed_color="${GREEN}"

  echo -e "\n${BOLD}${UNDERLINE}Test Summary${NC}"
  echo -e "${BOLD}Total Tests:${NC} $TOTAL_TESTS"
  echo -e "${GREEN}${BOLD}Passed:${NC} $PASSED_TESTS"

  # The "Failed" line turns red only when at least one failure was recorded.
  if [ $FAILED_TESTS -gt 0 ]; then
    failed_color="${RED}"
  fi
  echo -e "${failed_color}${BOLD}Failed:${NC} $FAILED_TESTS"

  if ! [ $FAILED_TESTS -eq 0 ]; then
    echo -e "\n${RED}❌ Some tests failed. Please check the errors above.${NC}"
    return 1
  fi

  echo -e "\n${GREEN}🎉 All tests passed! sqlite-rembed integration is fully functional.${NC}"
  return 0
}
###############################################################################
# Main Test Suite
###############################################################################
# Detect bc: when present it is used for floating point math, otherwise the
# suite falls back to awk for float comparisons.
HAS_BC=false
if command -v bc >/dev/null 2>&1; then
  HAS_BC=true
else
  print_info "bc calculator not found, using awk for float comparisons"
fi

# awk is mandatory; stop with the setup-failure exit code when it is missing.
command -v awk >/dev/null 2>&1 || {
  echo -e "${RED}Error: awk not found. awk is required for this test suite.${NC}"
  exit 2
}
#######################################
# Run the complete sqlite-rembed integration suite, phase by phase:
# connectivity and function checks, client configuration, embedding
# generation, table setup, embedding storage, similarity search, edge cases,
# a basic timing check, and cleanup, followed by the summary.
#
# Every query whose failure is a legitimate test outcome has its exit status
# captured with `||`, so that `set -e` cannot abort the run before the result
# is recorded and the summary is printed.
#
# Globals:   PROXYSQL_HOST, PROXYSQL_PORT, API_URL, API_CLIENT_NAME,
#            API_FORMAT, API_KEY, API_MODEL, VECTOR_DIMENSIONS, HAS_BC,
#            TOTAL_TESTS and the color variables (read)
# Outputs:   full test report on stdout
# Returns:   the status of print_summary
#######################################
main() {
  # Expected byte length of one embedding: 4 bytes per float dimension.
  local embedding_bytes=$((VECTOR_DIMENSIONS * 4))

  print_header "sqlite-rembed Integration Test Suite"
  echo -e "Starting at: $(date)"
  echo -e "ProxySQL: ${PROXYSQL_HOST}:${PROXYSQL_PORT}"
  echo -e "API Endpoint: ${API_URL}"
  echo ""

  # Initial cleanup; best-effort, so it must never stop the suite.
  cleanup_tables || true

  ###########################################################################
  # Phase 1: Basic Connectivity and Function Verification
  ###########################################################################
  print_header "Phase 1: Basic Connectivity and Function Verification"

  # Test 1.1: Basic connectivity
  run_test "Basic ProxySQL connectivity" \
    "SELECT 1 as connectivity_test;" \
    "1"

  # Test 1.2: Check database
  run_test "Database listing" \
    "SHOW DATABASES;" \
    "main"

  # Test 1.3: Verify sqlite-vec functions exist
  run_test "Check sqlite-vec functions" \
    "SELECT name FROM pragma_function_list WHERE name LIKE 'vec%' LIMIT 1;" \
    "vec"

  # Test 1.4: Verify rembed functions are registered
  run_test "Check rembed function registration" \
    "SELECT name FROM pragma_function_list WHERE name LIKE 'rembed%' ORDER BY name;" \
    "rembed"

  # Test 1.5: Verify temp.rembed_clients virtual table schema
  run_test "Check temp.rembed_clients table exists" \
    "SELECT name FROM sqlite_master WHERE name='rembed_clients' AND type='table';" \
    "rembed_clients"

  ###########################################################################
  # Phase 2: Client Configuration
  ###########################################################################
  print_header "Phase 2: Client Configuration"

  # Test 2.1: Create embedding client
  local create_client_sql="INSERT INTO temp.rembed_clients(name, options) VALUES
('$API_CLIENT_NAME',
rembed_client_options(
'format', '$API_FORMAT',
'url', '$API_URL',
'key', '$API_KEY',
'model', '$API_MODEL'
)
);"
  run_test "Create embedding API client" \
    "$create_client_sql" \
    ""

  # Test 2.2: Verify client creation
  run_test "Verify client in temp.rembed_clients" \
    "SELECT name FROM temp.rembed_clients WHERE name='$API_CLIENT_NAME';" \
    "$API_CLIENT_NAME"

  # Test 2.3: Test rembed_client_options function
  run_test "Test rembed_client_options function" \
    "SELECT typeof(rembed_client_options('format', 'openai', 'model', 'test')) as options_type;" \
    "text"

  ###########################################################################
  # Phase 3: Embedding Generation Tests
  ###########################################################################
  print_header "Phase 3: Embedding Generation Tests"

  # Test 3.1: Generate simple embedding
  run_test "Generate embedding for short text" \
    "SELECT LENGTH(rembed('$API_CLIENT_NAME', 'hello world')) as embedding_length;" \
    "$embedding_bytes"

  # Test 3.2: Test embedding type
  run_test "Verify embedding data type" \
    "SELECT typeof(rembed('$API_CLIENT_NAME', 'test')) as embedding_type;" \
    "blob"

  # Test 3.3: Generate embedding for longer text
  run_test "Generate embedding for longer text" \
    "SELECT LENGTH(rembed('$API_CLIENT_NAME', 'The quick brown fox jumps over the lazy dog')) as embedding_length;" \
    "$embedding_bytes"

  # Test 3.4: Error handling - non-existent client
  print_test "Error handling: non-existent client"
  local error_result
  # This query is *expected* to fail; `|| true` keeps errexit from ending the
  # run on exactly the outcome the test is looking for.
  error_result=$(execute_query "SELECT rembed('non-existent-client', 'test');" "true") || true
  if grep -q "was not registered with rembed_clients" <<<"$error_result"; then
    print_success "Proper error for non-existent client"
  else
    print_failure "Error handling" "Expected error message not found: $error_result"
  fi

  ###########################################################################
  # Phase 4: Table Creation and Data Storage
  ###########################################################################
  print_header "Phase 4: Table Creation and Data Storage"

  # Test 4.1: Create regular table for documents
  run_test "Create documents table" \
    "CREATE TABLE test_documents (
id INTEGER PRIMARY KEY,
title TEXT NOT NULL,
content TEXT NOT NULL,
created_at DATETIME DEFAULT CURRENT_TIMESTAMP
);" \
    ""

  # Test 4.2: Create virtual vector table
  run_test "Create virtual vector table" \
    "CREATE VIRTUAL TABLE test_embeddings USING vec0(
embedding float[$VECTOR_DIMENSIONS]
);" \
    ""

  # Test 4.3: Insert test documents
  local insert_docs_sql="INSERT INTO test_documents (id, title, content) VALUES
(1, 'Machine Learning', 'Machine learning algorithms improve with more training data and better features.'),
(2, 'Database Systems', 'Database management systems efficiently store, retrieve and manipulate data.'),
(3, 'Artificial Intelligence', 'AI enables computers to perform tasks typically requiring human intelligence.'),
(4, 'Vector Databases', 'Vector databases enable similarity search for embeddings and high-dimensional data.');"
  run_test "Insert test documents" \
    "$insert_docs_sql" \
    ""

  # Test 4.4: Verify document insertion
  run_test "Verify document count" \
    "SELECT COUNT(*) as doc_count FROM test_documents;" \
    "4"

  ###########################################################################
  # Phase 5: Embedding Generation and Storage
  ###########################################################################
  print_header "Phase 5: Embedding Generation and Storage"

  # Test 5.1: Generate and store embeddings
  run_test "Generate and store embeddings for all documents" \
    "INSERT INTO test_embeddings(rowid, embedding)
SELECT id, rembed('$API_CLIENT_NAME', title || ': ' || content)
FROM test_documents;" \
    ""

  # Test 5.2: Verify embeddings were stored
  run_test "Verify embedding count matches document count" \
    "SELECT COUNT(*) as embedding_count FROM test_embeddings;" \
    "4"

  # Test 5.3: Check embedding data structure
  run_test "Check embedding storage format" \
    "SELECT rowid, LENGTH(embedding) as bytes FROM test_embeddings LIMIT 1;" \
    "$embedding_bytes"

  ###########################################################################
  # Phase 6: Similarity Search Tests
  ###########################################################################
  print_header "Phase 6: Similarity Search Tests"

  # Test 6.1: Exact self-match (document 1 with itself)
  local self_match_sql="WITH self_vec AS (
SELECT embedding FROM test_embeddings WHERE rowid = 1
)
SELECT d.id, d.title, e.distance
FROM test_documents d
JOIN test_embeddings e ON d.id = e.rowid
CROSS JOIN self_vec
WHERE e.embedding MATCH self_vec.embedding
ORDER BY e.distance ASC
LIMIT 3;"
  print_test "Exact self-match similarity search"
  local match_result
  local match_rc=0
  match_result=$(execute_query "$self_match_sql" "true") || match_rc=$?
  if [ "$match_rc" -eq 0 ] \
    && grep -q "1.*Machine Learning.*0.0" <<<"$match_result"; then
    print_success "Exact self-match works correctly"
    echo " Result: Document 1 has distance 0.0 (exact match)"
  else
    print_failure "Self-match search" "Self-match failed or incorrect: $match_result"
  fi

  # Test 6.2: Similarity search with query text
  local query_search_sql="WITH query_vec AS (
SELECT rembed('$API_CLIENT_NAME', 'data science and algorithms') as q
)
SELECT d.id, d.title, e.distance
FROM test_documents d
JOIN test_embeddings e ON d.id = e.rowid
CROSS JOIN query_vec
WHERE e.embedding MATCH query_vec.q
ORDER BY e.distance ASC
LIMIT 3;"
  print_test "Similarity search with query text"
  local search_result
  local search_rc=0
  search_result=$(execute_query "$query_search_sql" "true") || search_rc=$?
  if [ "$search_rc" -eq 0 ] && [ -n "$search_result" ]; then
    print_success "Similarity search returns results"
    echo " Results returned: $(wc -l <<<"$search_result")"
  else
    print_failure "Similarity search" "Search failed: $search_result"
  fi

  # Test 6.3: Verify search ordering (distances should be ascending)
  print_test "Verify search result ordering"
  local distances=""
  # Only mine distances from a successful search; after a failure the captured
  # text is an error message, and decimals inside it are not distances.
  if [ "$search_rc" -eq 0 ]; then
    distances=$(grep -o '[0-9]\+\.[0-9]\+' <<<"$search_result" || true)
  fi
  if [ -n "$distances" ]; then
    # Check if distances are non-decreasing (allows equal distances)
    local prev=-1
    local ordered=true
    local dist
    # Unquoted on purpose: split the newline-separated matches into words.
    for dist in $distances; do
      if [ "$HAS_BC" = true ]; then
        # Use bc for precise float comparison
        if (( $(echo "$dist < $prev" | bc -l 2>/dev/null || echo "0") )); then
          ordered=false
          break
        fi
      else
        # Use awk for float comparison (less precise but works)
        if ! awk -v d="$dist" -v p="$prev" 'BEGIN { exit !(d >= p) }' 2>/dev/null; then
          ordered=false
          break
        fi
      fi
      prev=$dist
    done
    if [ "$ordered" = true ]; then
      print_success "Results ordered by ascending distance"
    else
      print_failure "Result ordering" "Distances not in ascending order: $distances"
    fi
  else
    print_info "No distances to verify ordering"
  fi

  ###########################################################################
  # Phase 7: Edge Cases and Error Handling
  ###########################################################################
  print_header "Phase 7: Edge Cases and Error Handling"

  # Test 7.1: Empty text input
  run_test "Empty text input handling" \
    "SELECT LENGTH(rembed('$API_CLIENT_NAME', '')) as empty_embedding_length;" \
    "$embedding_bytes"

  # Test 7.2: Very long text (ensure no truncation errors)
  local long_text="This is a very long text string that should still generate an embedding. "
  long_text="${long_text}${long_text}${long_text}${long_text}${long_text}" # 5x repetition
  run_test "Long text input handling" \
    "SELECT LENGTH(rembed('$API_CLIENT_NAME', '$long_text')) as long_text_length;" \
    "$embedding_bytes"

  # Test 7.3: SQL injection attempt in text parameter
  run_test "SQL injection attempt handling" \
    "SELECT LENGTH(rembed('$API_CLIENT_NAME', 'test'' OR ''1''=''1')) as injection_safe_length;" \
    "$embedding_bytes"

  ###########################################################################
  # Phase 8: Performance and Concurrency (Basic)
  ###########################################################################
  print_header "Phase 8: Performance and Concurrency"

  # Test 8.1: Sequential embedding generation timing
  print_test "Sequential embedding generation timing"
  local start_time end_time elapsed
  local perf_rc=0
  local within_budget=false
  # NOTE(review): %N (nanoseconds) is a GNU date extension - confirm the
  # target platforms provide it.
  start_time=$(date +%s.%N)
  execute_query "SELECT rembed('$API_CLIENT_NAME', 'performance test 1');
SELECT rembed('$API_CLIENT_NAME', 'performance test 2');
SELECT rembed('$API_CLIENT_NAME', 'performance test 3');" >/dev/null 2>&1 || perf_rc=$?
  end_time=$(date +%s.%N)

  if [ "$HAS_BC" = true ]; then
    elapsed=$(echo "$end_time - $start_time" | bc)
    if (( $(echo "$elapsed < 10" | bc -l) )); then
      within_budget=true
    fi
  else
    elapsed=$(awk -v s="$start_time" -v e="$end_time" 'BEGIN { printf "%.2f", e - s }' 2>/dev/null || echo "0")
    # Simple float comparison with awk
    if awk -v e="$elapsed" 'BEGIN { exit !(e < 10) }' 2>/dev/null; then
      within_budget=true
    fi
  fi

  if [ "$perf_rc" -ne 0 ]; then
    # A fast failure must not be reported as good performance.
    print_failure "Performance" "Embedding queries failed (exit code ${perf_rc}) after ${elapsed}s"
  elif [ "$within_budget" = true ]; then
    print_success "Sequential embeddings generated in ${elapsed}s"
  else
    print_failure "Performance" "Embedding generation took too long: ${elapsed}s"
  fi

  ###########################################################################
  # Phase 9: Cleanup and Final Verification
  ###########################################################################
  print_header "Phase 9: Cleanup and Final Verification"

  # Test 9.1: Cleanup test tables
  run_test "Cleanup test tables" \
    "DROP TABLE IF EXISTS test_documents;
DROP TABLE IF EXISTS test_embeddings;" \
    ""

  # Test 9.2: Verify cleanup
  run_test "Verify tables are removed" \
    "SELECT COUNT(*) as remaining_tests FROM sqlite_master WHERE name LIKE 'test_%';" \
    "0"

  ###########################################################################
  # Final Summary
  ###########################################################################
  print_header "Test Suite Complete"
  echo -e "Embedding API Client: ${API_CLIENT_NAME}"
  echo -e "Vector Dimensions: ${VECTOR_DIMENSIONS}"
  echo -e "Total Operations Tested: ${TOTAL_TESTS}"

  # Capture the status with `||`: a bare failing call would end the script
  # under `set -e` before the closing report below is printed.
  local summary_exit=0
  print_summary || summary_exit=$?

  # Final system status
  # NOTE(review): these four lines are static text and are printed even when
  # tests failed - consider deriving them from the actual results.
  echo -e "\n${BOLD}System Status:${NC}"
  echo -e "ProxySQL SQLite3 Server: ${GREEN}✅ Accessible${NC}"
  echo -e "sqlite-rembed Extension: ${GREEN}✅ Loaded${NC}"
  echo -e "Embedding API: ${GREEN}✅ Responsive${NC}"
  echo -e "Vector Search: ${GREEN}✅ Functional${NC}"

  if [ "$summary_exit" -eq 0 ]; then
    echo -e "\n${GREEN}${BOLD}✓ sqlite-rembed integration test suite completed successfully${NC}"
    echo -e "All components are functioning correctly."
  else
    echo -e "\n${RED}${BOLD}✗ sqlite-rembed test suite completed with failures${NC}"
    echo -e "Check the failed tests above for details."
  fi

  return "$summary_exit"
}
###############################################################################
# Script Entry Point
###############################################################################
# Preflight 1: the mysql command-line client has to be installed.
command -v mysql >/dev/null 2>&1 || {
  echo -e "${RED}Error: MySQL client not found. Please install mysql-client.${NC}"
  exit 2
}

# Preflight 2: ProxySQL has to accept a trivial query with the configured
# credentials; all client output is discarded, only the status matters.
mysql -h "$PROXYSQL_HOST" -P "$PROXYSQL_PORT" -u "$MYSQL_USER" -p"$MYSQL_PASS" \
  -e "SELECT 1;" >/dev/null 2>&1 || {
  echo -e "${RED}Error: Cannot connect to ProxySQL at ${PROXYSQL_HOST}:${PROXYSQL_PORT}${NC}"
  echo "Make sure ProxySQL is running with: ./proxysql --sqlite3-server"
  exit 2
}

# Hand over to the suite and exit with its status.
main
exit $?