/**
* @file LLM_Bridge.h
* @brief Generic LLM Bridge for ProxySQL
*
* The LLM_Bridge class provides a generic interface to Large Language Models
* using multiple LLM providers with hybrid deployment and vector-based
* semantic caching.
*
* Key Features:
* - Multi-provider LLM support (local + generic cloud)
* - Semantic similarity caching using sqlite-vec
* - Generic prompt handling (not SQL-specific)
* - Configurable model selection based on latency/budget
* - Generic provider support (OpenAI-compatible, Anthropic-compatible)
*
* @date 2025-01-17
* @version 1.0.0
*
* Example Usage:
* @code
* LLMRequest req;
* req.prompt = "Summarize this data...";
* LLMResult result = bridge->process(req);
* std::cout << result.text_response << std::endl;
* @endcode
*/
#ifndef __CLASS_LLM_BRIDGE_H
#define __CLASS_LLM_BRIDGE_H
#ifdef PROXYSQLGENAI
#define LLM_BRIDGE_VERSION "1.0.0"
#include "proxysql.h"
#include <string>
#include <vector>
#include <cstdint> // int64_t in LLMResult
#include <cstdio>  // snprintf() for request ID generation
#include <cstdlib> // rand()
// Forward declarations
class SQLite3DB;
/**
* @brief Result structure for LLM bridge processing
*
* Contains the LLM text response along with metadata including
* cache status, error details, and performance timing.
*
* @note When errors occur, error_code, error_details, and http_status_code
* provide diagnostic information for troubleshooting.
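*
* Example (illustrative sketch; assumes a bridge instance and a request
* built as in the other examples in this header):
* @code
* LLMResult res = bridge->process(req);
* if (!res.error_code.empty()) {
*     fprintf(stderr, "LLM error [%s] via %s (HTTP %d): %s\n",
*             res.error_code.c_str(), res.provider_used.c_str(),
*             res.http_status_code, res.error_details.c_str());
* } else {
*     printf("%s (cached=%d, total=%d ms)\n", res.text_response.c_str(),
*            (int)res.cached, res.total_time_ms);
* }
* @endcode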
*/
struct LLMResult {
std::string text_response; ///< LLM-generated text response
std::string explanation; ///< Which model generated this
bool cached; ///< True if from semantic cache
int64_t cache_id; ///< Cache entry ID for tracking
// Error details - populated when processing fails
std::string error_code; ///< Structured error code (e.g., "ERR_API_KEY_MISSING")
std::string error_details; ///< Detailed error context with query, provider, URL
int http_status_code; ///< HTTP status code if applicable (0 if N/A)
std::string provider_used; ///< Which provider was attempted
// Performance timing information
int total_time_ms; ///< Total processing time in milliseconds
int cache_lookup_time_ms; ///< Cache lookup time in milliseconds
int cache_store_time_ms; ///< Cache store time in milliseconds
int llm_call_time_ms; ///< LLM call time in milliseconds
bool cache_hit; ///< True if cache was hit
LLMResult() : cached(false), cache_id(0), http_status_code(0),
total_time_ms(0), cache_lookup_time_ms(0), cache_store_time_ms(0),
llm_call_time_ms(0), cache_hit(false) {}
};
/**
* @brief Request structure for LLM bridge processing
*
* Contains the prompt text and context for LLM processing.
*
* @note If max_latency_ms is set and < 500ms, the system will prefer
* local Ollama regardless of provider preference.
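*
* With the default retry fields, transient failures are retried with an
* exponential backoff derived from retry_backoff_ms and retry_multiplier
* (roughly 1000 ms, 2000 ms, 4000 ms), capped at retry_max_backoff_ms.
*
* Example (illustrative values, not recommended defaults):
* @code
* LLMRequest req;
* req.prompt = "Summarize the slow query log for schema 'sbtest'";
* req.schema_name = "sbtest";
* req.max_latency_ms = 400;   // < 500ms: local Ollama is preferred
* req.max_retries = 2;        // tighter retry budget than the default 3
* req.retry_backoff_ms = 500; // first retry after ~500 ms
* @endcode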
*/
struct LLMRequest {
std::string prompt; ///< Prompt text for LLM
std::string system_message; ///< Optional system role message
std::string schema_name; ///< Optional schema/database context
int max_latency_ms; ///< Max acceptable latency (ms)
bool allow_cache; ///< Enable semantic cache lookup
// Request tracking for correlation and debugging
std::string request_id; ///< Unique ID for this request (UUID-like)
// Retry configuration for transient failures
int max_retries; ///< Maximum retry attempts (default: 3)
int retry_backoff_ms; ///< Initial backoff in ms (default: 1000)
double retry_multiplier; ///< Backoff multiplier (default: 2.0)
int retry_max_backoff_ms; ///< Maximum backoff in ms (default: 30000)
LLMRequest() : max_latency_ms(0), allow_cache(true),
max_retries(3), retry_backoff_ms(1000),
retry_multiplier(2.0), retry_max_backoff_ms(30000) {
// Generate a UUID-like request ID for correlation (non-cryptographic).
// The final 48-bit field combines two rand() calls, since a single
// rand() typically yields at most 31 bits of randomness.
char uuid[64];
unsigned long node = (((unsigned long)rand() << 16) ^ (unsigned long)rand()) & 0xffffffffffffUL;
snprintf(uuid, sizeof(uuid), "%08lx-%04x-%04x-%04x-%012lx",
(unsigned long)rand(), (unsigned)rand() & 0xffff,
(unsigned)rand() & 0xffff, (unsigned)rand() & 0xffff,
node);
request_id = uuid;
}
};
/**
* @brief Error codes for LLM bridge processing
*
* Structured error codes that provide machine-readable error information
* for programmatic handling and user-friendly error messages.
*
* Each code maps to a stable string name (see llm_error_code_to_string())
* and can be used for:
* - Conditional logic (switch on error type)
* - Logging and monitoring
* - User error messages
*
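* Example (illustrative sketch of a caller classifying a code; the helper
* name log_llm_error is hypothetical):
* @code
* void log_llm_error(LLMErrorCode code) {
*     switch (code) {
*         case LLMErrorCode::ERR_TIMEOUT:
*         case LLMErrorCode::ERR_RATE_LIMITED:
*         case LLMErrorCode::ERR_SERVER_ERROR:
*             // transient conditions: a retry with backoff is reasonable
*             break;
*         default:
*             fprintf(stderr, "LLM bridge error: %s\n",
*                     llm_error_code_to_string(code));
*     }
* }
* @endcode
*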
* @see llm_error_code_to_string()
*/
enum class LLMErrorCode {
SUCCESS = 0, ///< No error
ERR_API_KEY_MISSING, ///< API key not configured
ERR_API_KEY_INVALID, ///< API key format is invalid
ERR_TIMEOUT, ///< Request timed out
ERR_CONNECTION_FAILED, ///< Network connection failed
ERR_RATE_LIMITED, ///< Rate limited by provider (HTTP 429)
ERR_SERVER_ERROR, ///< Server error (HTTP 5xx)
ERR_EMPTY_RESPONSE, ///< Empty response from LLM
ERR_INVALID_RESPONSE, ///< Malformed response from LLM
ERR_VALIDATION_FAILED, ///< Input validation failed
ERR_UNKNOWN_PROVIDER, ///< Invalid provider name
ERR_REQUEST_TOO_LARGE ///< Request exceeds size limit
};
/**
* @brief Convert error code enum to string representation
*
* Returns the string representation of an error code for logging
* and display purposes.
*
* @param code The error code to convert
* @return String representation of the error code
*/
const char* llm_error_code_to_string(LLMErrorCode code);
/**
* @brief Model provider format types for LLM bridge
*
* Defines the API format to use for generic providers:
* - GENERIC_OPENAI: Any OpenAI-compatible endpoint (including Ollama)
* - GENERIC_ANTHROPIC: Any Anthropic-compatible endpoint
* - FALLBACK_ERROR: No model available (error state)
*
* @note For all providers, URL and API key are configured via variables.
* Ollama can be used via its OpenAI-compatible endpoint at /v1/chat/completions.
*
* @note Missing API keys will result in error (no automatic fallback).
*/
enum class ModelProvider {
GENERIC_OPENAI, ///< Any OpenAI-compatible endpoint (configurable URL)
GENERIC_ANTHROPIC, ///< Any Anthropic-compatible endpoint (configurable URL)
FALLBACK_ERROR ///< No model available (error state)
};
/**
* @brief Generic LLM Bridge class
*
* Processes prompts using LLMs with hybrid local/cloud model support
* and vector cache.
*
* Architecture:
* - Vector cache for semantic similarity (sqlite-vec)
* - Model selection based on latency/budget
* - Generic HTTP client (libcurl) supporting multiple API formats
* - Generic prompt handling (not tied to SQL)
*
* Configuration Variables:
* - genai_llm_provider: "ollama", "openai", or "anthropic"
* - genai_llm_provider_url: Custom endpoint URL (for generic providers)
* - genai_llm_provider_model: Model name
* - genai_llm_provider_key: API key (optional for local)
*
* Thread Safety:
* - This class is NOT thread-safe by itself
* - External locking must be provided by AI_Features_Manager
*
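* Typical lifecycle (sketch; in ProxySQL the sequencing and the external
* locking are handled by AI_Features_Manager):
* @code
* LLM_Bridge* bridge = new LLM_Bridge();
* bridge->set_vector_db(db);          // SQLite3DB* provided by the caller
* if (bridge->init() == 0) {
*     LLMRequest req;
*     req.prompt = "Summarize this data...";
*     LLMResult res = bridge->process(req);
* }
* bridge->close();
* delete bridge;
* @endcode
*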
* @see AI_Features_Manager, LLMRequest, LLMResult
*/
class LLM_Bridge {
private:
struct {
bool enabled;
char* provider; ///< "ollama", "openai", or "anthropic"
char* provider_url; ///< Generic endpoint URL
char* provider_model; ///< Model name
char* provider_key; ///< API key
int cache_similarity_threshold;
int timeout_ms;
} config;
SQLite3DB* vector_db;
// Internal methods
std::string build_prompt(const LLMRequest& req);
std::string call_generic_openai(const std::string& prompt, const std::string& model,
const std::string& url, const char* key,
const std::string& req_id = "");
std::string call_generic_anthropic(const std::string& prompt, const std::string& model,
const std::string& url, const char* key,
const std::string& req_id = "");
// Retry wrapper methods
std::string call_generic_openai_with_retry(const std::string& prompt, const std::string& model,
const std::string& url, const char* key,
const std::string& req_id,
int max_retries, int initial_backoff_ms,
double backoff_multiplier, int max_backoff_ms);
std::string call_generic_anthropic_with_retry(const std::string& prompt, const std::string& model,
const std::string& url, const char* key,
const std::string& req_id,
int max_retries, int initial_backoff_ms,
double backoff_multiplier, int max_backoff_ms);
LLMResult check_cache(const LLMRequest& req); ///< Look up a semantically similar cached response
void store_in_cache(const LLMRequest& req, const LLMResult& result); ///< Persist a response with its embedding
ModelProvider select_model(const LLMRequest& req); ///< Choose API format from config and latency budget
std::vector<float> get_text_embedding(const std::string& text); ///< Embedding used for semantic cache matching
public:
/**
* @brief Constructor - initializes with default configuration
*
* Sets up default values:
* - provider: "openai"
* - provider_url: "http://localhost:11434/v1/chat/completions" (Ollama default)
* - provider_model: "llama3.2"
* - cache_similarity_threshold: 85
* - timeout_ms: 30000
*/
LLM_Bridge();
/**
* @brief Destructor - frees allocated resources
*/
~LLM_Bridge();
/**
* @brief Initialize the LLM bridge
*
* Initializes vector DB connection and validates configuration.
* The vector_db will be provided by AI_Features_Manager.
*
* @return 0 on success, non-zero on failure
*/
int init();
/**
* @brief Shutdown the LLM bridge
*
* Closes vector DB connection and cleans up resources.
*/
void close();
/**
* @brief Set the vector database for caching
*
* Sets the vector database instance for semantic similarity caching.
* Called by AI_Features_Manager during initialization.
*
* @param db Pointer to SQLite3DB instance
*/
void set_vector_db(SQLite3DB* db) { vector_db = db; }
/**
* @brief Update configuration from AI_Features_Manager
*
* Copies configuration variables from AI_Features_Manager to internal config.
* This is called by AI_Features_Manager when variables change.
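*
* Example (illustrative values matching the documented defaults: a local
* Ollama instance reached through its OpenAI-compatible endpoint):
* @code
* bridge->update_config("openai",
*                       "http://localhost:11434/v1/chat/completions",
*                       "llama3.2",
*                       "",      // API key is optional for local providers
*                       85,      // cache_similarity_threshold
*                       30000);  // timeout_ms
* @endcode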
*/
void update_config(const char* provider, const char* provider_url, const char* provider_model,
const char* provider_key, int cache_threshold, int timeout);
/**
* @brief Process a prompt using the LLM
*
* This is the main entry point for LLM bridge processing. The flow is:
* 1. Check vector cache for semantically similar prompts
* 2. Build prompt with optional system message
* 3. Select appropriate model (Ollama or generic provider)
* 4. Call LLM API
* 5. Parse response
* 6. Store in vector cache for future use
*
* @param req LLM request containing prompt and context
* @return LLMResult with text response and metadata
*
* @note This is a synchronous blocking call. For non-blocking behavior,
* use the async interface via MySQL_Session.
*
* Example:
* @code
* LLMRequest req;
* req.prompt = "Explain this query: SELECT * FROM users";
* req.allow_cache = true;
* LLMResult result = bridge.process(req);
* std::cout << result.text_response << std::endl;
* @endcode
*/
LLMResult process(const LLMRequest& req);
/**
* @brief Clear the vector cache
*
* Removes all cached LLM responses from the vector database.
* This is useful for testing or when context changes significantly.
*/
void clear_cache();
/**
* @brief Get cache statistics
*
* Returns JSON string with cache metrics:
* - entries: Total number of cached responses
* - hits: Number of cache hits
* - misses: Number of cache misses
*
* @return JSON string with cache statistics
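*
* Example output (illustrative values; exact formatting may differ):
* @code
* {"entries": 128, "hits": 542, "misses": 97}
* @endcode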
*/
std::string get_cache_stats();
};
#endif /* PROXYSQLGENAI */
#endif // __CLASS_LLM_BRIDGE_H