docs: Add comprehensive doxygen comments to NL2SQL headers and LLM_Clients

- Add file-level doxygen documentation with @file, @brief, @date, @version
- Add detailed class and method documentation with @param, @return, @note, @see
- Document data structures (NL2SQLRequest, NL2SQLResult, ModelProvider)
- Add section comments and inline documentation for implementation files
- Document all three LLM provider APIs (Ollama, OpenAI, Anthropic)
4 months ago · 4f45c25945
parent bc4fff12ce
commit 4f45c25945
5 changed files with 438 additions and 29 deletions
--- a/include/AI_Features_Manager.h
+++ b/include/AI_Features_Manager.h
@ -1,3 +1,32 @@
+/**
+ * @file AI_Features_Manager.h
+ * @brief AI Features Manager for ProxySQL
+ *
+ * The AI_Features_Manager class coordinates all AI-related features in ProxySQL:
+ * - NL2SQL (Natural Language to SQL) conversion
+ * - Anomaly detection for security monitoring
+ * - Vector storage for semantic caching
+ * - Hybrid model routing (local Ollama + cloud APIs)
+ *
+ * Architecture:
+ * - Central configuration management with 'ai-' variable prefix
+ * - Thread-safe operations using pthread rwlock
+ * - Follows same pattern as MCP_Threads_Handler and GenAI_Threads_Handler
+ * - Coordinates with MySQL_Session for query interception
+ *
+ * @date 2025-01-16
+ * @version 0.1.0
+ *
+ * Example Usage:
+ * @code
+ * // Access NL2SQL converter
+ * NL2SQL_Converter* nl2sql = GloAI->get_nl2sql();
+ * NL2SQLRequest req;
+ * req.natural_language = "Show top customers";
+ * NL2SQLResult result = nl2sql->convert(req);
+ * @endcode
+ */
+
 #ifndef __CLASS_AI_FEATURES_MANAGER_H
 #define __CLASS_AI_FEATURES_MANAGER_H

@ -23,6 +52,12 @@ class SQLite3DB;
 *
 * This class follows the same pattern as MCP_Threads_Handler and GenAI_Threads_Handler
 * for configuration management and lifecycle.
+ *
+ * Thread Safety:
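+ * - Locking is built on a pthread rwlock; use wrlock()/wrunlock() for manual locking
+ * - The inline getters (get_nl2sql(), get_anomaly_detector(), get_vector_db())
+ *   return the raw pointer without taking the lock, so call them between
+ *   wrlock() and wrunlock()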
+ *
+ * @see NL2SQL_Converter, Anomaly_Detector
 */
 class AI_Features_Manager {
 private:
@ -97,28 +132,132 @@ public:
 		double daily_cloud_spend_usd;
 	} status_variables;

+	/**
+	 * @brief Constructor - initializes with default configuration
+	 */
 	AI_Features_Manager();
+
+	/**
+	 * @brief Destructor - cleanup resources
+	 */
 	~AI_Features_Manager();

-	// Lifecycle
+	/**
+	 * @brief Initialize all AI features
+	 *
+	 * Initializes vector database, NL2SQL converter, and anomaly detector.
+	 * This must be called after ProxySQL configuration is loaded.
+	 *
+	 * @return 0 on success, non-zero on failure
+	 */
 	int init();
+
+	/**
+	 * @brief Shutdown all AI features
+	 *
+	 * Gracefully shuts down all components and frees resources.
+	 * Safe to call multiple times.
+	 */
 	void shutdown();

-	// Thread-safe locking
+	/**
+	 * @brief Acquire write lock for thread-safe operations
+	 *
+	 * Use this for manual locking when performing multiple operations
+	 * that need to be atomic.
+	 *
+	 * @note Must be paired with wrunlock()
+	 */
 	void wrlock();
+
+	/**
+	 * @brief Release write lock
+	 *
+	 * @note Must be called after wrlock()
+	 */
 	void wrunlock();

-	// Component access
+	/**
+	 * @brief Get NL2SQL converter instance
+	 *
+	 * @return Pointer to NL2SQL_Converter or NULL if not initialized
+	 *
+	 * @note Thread-safe when called within wrlock()/wrunlock() pair
+	 */
 	NL2SQL_Converter* get_nl2sql() { return nl2sql_converter; }
+
+	/**
+	 * @brief Get anomaly detector instance
+	 *
+	 * @return Pointer to Anomaly_Detector or NULL if not initialized
+	 *
+	 * @note Thread-safe when called within wrlock()/wrunlock() pair
+	 */
 	Anomaly_Detector* get_anomaly_detector() { return anomaly_detector; }
+
+	/**
+	 * @brief Get vector database instance
+	 *
+	 * @return Pointer to SQLite3DB or NULL if not initialized
+	 *
+	 * @note Thread-safe when called within wrlock()/wrunlock() pair
+	 */
 	SQLite3DB* get_vector_db() { return vector_db; }

-	// Variable management (for admin interface)
+	/**
+	 * @brief Get configuration variable value
+	 *
+	 * Retrieves the value of an AI configuration variable by name.
+	 * Variable names should be without the 'ai_' prefix.
+	 *
+	 * @param name Variable name (e.g., "nl2sql_enabled")
+	 * @return Variable value or NULL if not found
+	 *
+	 * Example:
+	 * @code
+	 * char* enabled = GloAI->get_variable("nl2sql_enabled");
+	 * if (enabled && strcmp(enabled, "true") == 0) { ... }
+	 * @endcode
+	 */
 	char* get_variable(const char* name);
+
+	/**
+	 * @brief Set configuration variable value
+	 *
+	 * Updates an AI configuration variable at runtime.
+	 * Variable names should be without the 'ai_' prefix.
+	 *
+	 * @param name Variable name (e.g., "nl2sql_enabled")
+	 * @param value New value
+	 * @return true on success, false on failure
+	 *
+	 * Example:
+	 * @code
+	 * GloAI->set_variable("nl2sql_ollama_model", "llama3.3");
+	 * @endcode
+	 */
 	bool set_variable(const char* name, const char* value);
+
+	/**
+	 * @brief Get list of all AI variable names
+	 *
+	 * Returns NULL-terminated array of variable names for admin interface.
+	 *
+	 * @return Array of strings (must be freed by caller)
+	 */
 	char** get_variables_list();

-	// Status reporting
+	/**
+	 * @brief Get AI features status as JSON
+	 *
+	 * Returns comprehensive status including:
+	 * - Enabled features
+	 * - Status counters (requests, cache hits, etc.)
+	 * - Current configuration
+	 * - Daily cloud spend
+	 *
+	 * @return JSON string with status information
+	 */
 	std::string get_status_json();
 };

--- a/include/Anomaly_Detector.h
+++ b/include/Anomaly_Detector.h
@ -1,3 +1,37 @@
+/**
+ * @file Anomaly_Detector.h
+ * @brief Real-time Anomaly Detection for ProxySQL
+ *
+ * The Anomaly_Detector class provides security threat detection using:
+ * - Embedding-based similarity to known threats
+ * - Statistical outlier detection
+ * - Rule-based pattern matching
+ * - Rate limiting per user/host
+ *
+ * Key Features:
+ * - Multi-stage detection pipeline
+ * - Behavioral profiling and tracking
+ * - Configurable risk thresholds
+ * - Auto-block or log-only modes
+ *
+ * @date 2025-01-16
+ * @version 0.1.0 (stub implementation)
+ *
+ * Example Usage:
+ * @code
+ * Anomaly_Detector* detector = GloAI->get_anomaly_detector();
+ * AnomalyResult result = detector->analyze(
+ *     "SELECT * FROM users",
+ *     "app_user",
+ *     "192.168.1.100",
+ *     "production"
+ * );
+ * if (result.should_block) {
+ *     proxy_warning("Query blocked: %s\n", result.explanation.c_str());
+ * }
+ * @endcode
+ */
+
 #ifndef __CLASS_ANOMALY_DETECTOR_H
 #define __CLASS_ANOMALY_DETECTOR_H

@ -13,6 +47,9 @@ class SQLite3DB;

 /**
 * @brief Anomaly detection result
+ *
+ * Contains the outcome of an anomaly check including risk score,
+ * anomaly type, explanation, and whether to block the query.
 */
 struct AnomalyResult {
 	bool is_anomaly;              ///< True if anomaly detected
--- a/include/NL2SQL_Converter.h
+++ b/include/NL2SQL_Converter.h
@ -1,3 +1,30 @@
+/**
+ * @file NL2SQL_Converter.h
+ * @brief Natural Language to SQL Converter for ProxySQL
+ *
+ * The NL2SQL_Converter class provides natural language to SQL conversion
+ * using multiple LLM providers (Ollama, OpenAI, Anthropic) with hybrid
+ * deployment and vector-based semantic caching.
+ *
+ * Key Features:
+ * - Multi-provider LLM support (local + cloud)
+ * - Semantic similarity caching using sqlite-vec
+ * - Schema-aware conversion
+ * - Configurable model selection based on latency/budget
+ *
+ * @date 2025-01-16
+ * @version 0.1.0
+ *
+ * Example Usage:
+ * @code
+ * NL2SQLRequest req;
+ * req.natural_language = "Show top 10 customers";
+ * req.schema_name = "sales";
+ * NL2SQLResult result = converter->convert(req);
+ * std::cout << result.sql_query << std::endl;
+ * @endcode
+ */
+
 #ifndef __CLASS_NL2SQL_CONVERTER_H
 #define __CLASS_NL2SQL_CONVERTER_H

@ -12,39 +39,61 @@ class SQLite3DB;

 /**
 * @brief Result structure for NL2SQL conversion
+ *
+ * Contains the generated SQL query along with metadata including
+ * confidence score, explanation, and cache status.
+ *
+ * @note The confidence score is a heuristic based on SQL validation
+ *       and LLM response quality. Actual SQL correctness should be
+ *       verified before execution.
 */
 struct NL2SQLResult {
-	std::string sql_query;                  ///< Generated SQL
-	float confidence;                        ///< 0.0-1.0
-	std::string explanation;                 ///< LLM explanation
-	std::vector<std::string> tables_used;    ///< Tables referenced
-	bool cached;                             ///< From cache
-	int64_t cache_id;                        ///< Cache entry ID
+	std::string sql_query;                  ///< Generated SQL query
+	float confidence;                        ///< Confidence score 0.0-1.0
+	std::string explanation;                 ///< Which model generated this
+	std::vector<std::string> tables_used;    ///< Tables referenced in SQL
+	bool cached;                             ///< True if from semantic cache
+	int64_t cache_id;                        ///< Cache entry ID for tracking

 	NL2SQLResult() : confidence(0.0f), cached(false), cache_id(0) {}
 };

 /**
 * @brief Request structure for NL2SQL conversion
+ *
+ * Contains the natural language query and context for conversion.
+ * Context includes schema name and optional table list for better
+ * SQL generation.
+ *
+ * @note If max_latency_ms is set and < 500ms, the system will prefer
+ *       local Ollama regardless of provider preference.
 */
 struct NL2SQLRequest {
-	std::string natural_language;           ///< Input query
-	std::string schema_name;                 ///< Current schema
-	int max_latency_ms;                      ///< Latency requirement
-	bool allow_cache;                        ///< Check vector cache
-	std::vector<std::string> context_tables; ///< Relevant tables
+	std::string natural_language;           ///< Natural language query text
+	std::string schema_name;                 ///< Current database/schema name
+	int max_latency_ms;                      ///< Max acceptable latency (ms)
+	bool allow_cache;                        ///< Enable semantic cache lookup
+	std::vector<std::string> context_tables; ///< Optional table hints for schema

 	NL2SQLRequest() : max_latency_ms(0), allow_cache(true) {}
 };

 /**
- * @brief Model provider options
+ * @brief Model provider options for NL2SQL conversion
+ *
+ * Defines available LLM providers with different trade-offs:
+ * - LOCAL_OLLAMA: Free, fast, limited model quality
+ * - CLOUD_OPENAI: Paid, slower, high quality
+ * - CLOUD_ANTHROPIC: Paid, slower, high quality
+ *
+ * @note The system automatically falls back to Ollama if cloud
+ *       API keys are not configured.
 */
 enum class ModelProvider {
-	LOCAL_OLLAMA,      ///< Local models via Ollama
-	CLOUD_OPENAI,      ///< OpenAI API
-	CLOUD_ANTHROPIC,   ///< Anthropic API
-	FALLBACK_ERROR     ///< No model available
+	LOCAL_OLLAMA,      ///< Local models via Ollama (default)
+	CLOUD_OPENAI,      ///< OpenAI API (requires API key)
+	CLOUD_ANTHROPIC,   ///< Anthropic API (requires API key)
+	FALLBACK_ERROR     ///< No model available (error state)
 };

 /**
@ -52,6 +101,18 @@ enum class ModelProvider {
 *
 * Converts natural language queries to SQL using LLMs with hybrid
 * local/cloud model support and vector cache.
+ *
+ * Architecture:
+ * - Vector cache for semantic similarity (sqlite-vec)
+ * - Model selection based on latency/budget
+ * - Multi-provider HTTP clients (libcurl)
+ * - Schema-aware prompt building
+ *
+ * Thread Safety:
+ * - This class is NOT thread-safe by itself
+ * - External locking must be provided by AI_Features_Manager
+ *
+ * @see AI_Features_Manager, NL2SQLRequest, NL2SQLResult
 */
 class NL2SQL_Converter {
 private:
@ -82,18 +143,102 @@ private:
 	ModelProvider select_model(const NL2SQLRequest& req);

 public:
+	/**
+	 * @brief Constructor - initializes with default configuration
+	 *
+	 * Sets up default values:
+	 * - query_prefix: "NL2SQL:"
+	 * - model_provider: "ollama"
+	 * - ollama_model: "llama3.2"
+	 * - openai_model: "gpt-4o-mini"
+	 * - anthropic_model: "claude-3-haiku"
+	 * - cache_similarity_threshold: 85
+	 * - timeout_ms: 30000
+	 */
 	NL2SQL_Converter();
+
+	/**
+	 * @brief Destructor - frees allocated resources
+	 */
 	~NL2SQL_Converter();

-	// Initialization
+	/**
+	 * @brief Initialize the NL2SQL converter
+	 *
+	 * Initializes vector DB connection and validates configuration.
+	 * The vector_db will be provided by AI_Features_Manager.
+	 *
+	 * @return 0 on success, non-zero on failure
+	 *
+	 * @note This is a stub implementation for Phase 2.
+	 *       Full vector cache integration is planned for Phase 3.
+	 */
 	int init();
+
+	/**
+	 * @brief Shutdown the NL2SQL converter
+	 *
+	 * Closes vector DB connection and cleans up resources.
+	 */
 	void close();

-	// Main conversion method
+	/**
+	 * @brief Convert natural language query to SQL
+	 *
+	 * This is the main entry point for NL2SQL conversion. The flow is:
+	 * 1. Check vector cache for semantically similar queries
+	 * 2. Build prompt with schema context
+	 * 3. Select appropriate model (Ollama/OpenAI/Anthropic)
+	 * 4. Call LLM API
+	 * 5. Parse and clean SQL response
+	 * 6. Store in vector cache for future use
+	 *
+	 * @param req NL2SQL request containing natural language query and context
+	 * @return NL2SQLResult with generated SQL, confidence score, and metadata
+	 *
+	 * @note This is a synchronous blocking call. For non-blocking behavior,
+	 *       use the async interface via MySQL_Session.
+	 *
+	 * @note The confidence score is heuristic-based. Actual SQL correctness
+	 *       should be verified before execution.
+	 *
+	 * @see NL2SQLRequest, NL2SQLResult, ModelProvider
+	 *
+	 * Example:
+	 * @code
+	 * NL2SQLRequest req;
+	 * req.natural_language = "Find customers with orders > $1000";
+	 * req.allow_cache = true;
+	 * NL2SQLResult result = converter.convert(req);
+	 * if (result.confidence > 0.7f) {
+	 *     // confidence is only a heuristic: validate the SQL before running it
+	 *     execute_sql(result.sql_query);
+	 * }
+	 * @endcode
+	 */
 	NL2SQLResult convert(const NL2SQLRequest& req);

-	// Cache management
+	/**
+	 * @brief Clear the vector cache
+	 *
+	 * Removes all cached NL2SQL conversions from the vector database.
+	 * This is useful for testing or when schema changes significantly.
+	 *
+	 * @note This is a stub implementation for Phase 2.
+	 */
 	void clear_cache();
+
+	/**
+	 * @brief Get cache statistics
+	 *
+	 * Returns JSON string with cache metrics:
+	 * - entries: Total number of cached conversions
+	 * - hits: Number of cache hits
+	 * - misses: Number of cache misses
+	 *
+	 * @return JSON string with cache statistics
+	 *
+	 * @note This is a stub implementation for Phase 2.
+	 */
 	std::string get_cache_stats();
 };

--- a/lib/LLM_Clients.cpp
+++ b/lib/LLM_Clients.cpp
@ -1,3 +1,23 @@
+/**
+ * @file LLM_Clients.cpp
+ * @brief HTTP client implementations for LLM providers
+ *
+ * This file implements HTTP clients for three LLM providers:
+ * - Ollama (local): POST http://localhost:11434/api/generate
+ * - OpenAI (cloud): POST https://api.openai.com/v1/chat/completions
+ * - Anthropic (cloud): POST https://api.anthropic.com/v1/messages
+ *
+ * All clients use libcurl for HTTP requests and nlohmann/json for
+ * request/response parsing. Each client handles:
+ * - Request formatting for the specific API
+ * - Authentication headers
+ * - Response parsing and SQL extraction
+ * - Markdown code block stripping
+ * - Error handling and logging
+ *
+ * @see NL2SQL_Converter.h
+ */
+
 #include "NL2SQL_Converter.h"
 #include "sqlite3db.h"
 #include "proxysql_utils.h"
@ -14,6 +34,18 @@ using json = nlohmann::json;
 // Write callback for curl responses
 // ============================================================================

+/**
+ * @brief libcurl write callback for collecting HTTP response data
+ *
+ * This callback is invoked by libcurl as data arrives.
+ * It appends the received data to a std::string buffer.
+ *
+ * @param contents Pointer to received data
+ * @param size Size of each element
+ * @param nmemb Number of elements
+ * @param userp User pointer (std::string* for response buffer)
+ * @return Total bytes processed
+ */
 static size_t WriteCallback(void* contents, size_t size, size_t nmemb, void* userp) {
 	size_t totalSize = size * nmemb;
 	std::string* response = static_cast<std::string*>(userp);
@ -26,10 +58,12 @@ static size_t WriteCallback(void* contents, size_t size, size_t nmemb, void* use
 // ============================================================================

 /**
- * @brief Call Ollama API for text generation
+ * @brief Call Ollama API for text generation (local LLM)
 *
 * Ollama endpoint: POST http://localhost:11434/api/generate
+ *
 * Request format:
+ * @code{.json}
 * {
 *   "model": "llama3.2",
 *   "prompt": "Convert to SQL: Show top customers",
@ -39,12 +73,20 @@ static size_t WriteCallback(void* contents, size_t size, size_t nmemb, void* use
 *     "num_predict": 500
 *   }
 * }
+ * @endcode
+ *
 * Response format:
+ * @code{.json}
 * {
 *   "response": "SELECT * FROM customers...",
 *   "model": "llama3.2",
 *   "total_duration": 123456789
 * }
+ * @endcode
+ *
+ * @param prompt The prompt to send to Ollama
+ * @param model Model name (e.g., "llama3.2")
+ * @return Generated SQL or empty string on error
 */
 std::string NL2SQL_Converter::call_ollama(const std::string& prompt, const std::string& model) {
 	std::string response_data;
@ -124,10 +166,12 @@ std::string NL2SQL_Converter::call_ollama(const std::string& prompt, const std::
 }

 /**
- * @brief Call OpenAI API for text generation
+ * @brief Call OpenAI API for text generation (cloud LLM)
 *
 * OpenAI endpoint: POST https://api.openai.com/v1/chat/completions
+ *
 * Request format:
+ * @code{.json}
 * {
 *   "model": "gpt-4o-mini",
 *   "messages": [
@ -137,7 +181,10 @@ std::string NL2SQL_Converter::call_ollama(const std::string& prompt, const std::
 *   "temperature": 0.1,
 *   "max_tokens": 500
 * }
+ * @endcode
+ *
 * Response format:
+ * @code{.json}
 * {
 *   "choices": [{
 *     "message": {
@ -148,6 +195,11 @@ std::string NL2SQL_Converter::call_ollama(const std::string& prompt, const std::
 *   }],
 *   "usage": {"total_tokens": 123}
 * }
+ * @endcode
+ *
+ * @param prompt The prompt to send to OpenAI
+ * @param model Model name (e.g., "gpt-4o-mini")
+ * @return Generated SQL or empty string on error
 */
 std::string NL2SQL_Converter::call_openai(const std::string& prompt, const std::string& model) {
 	std::string response_data;
--- a/lib/NL2SQL_Converter.cpp
+++ b/lib/NL2SQL_Converter.cpp
@ -1,3 +1,16 @@
+/**
+ * @file NL2SQL_Converter.cpp
+ * @brief Implementation of Natural Language to SQL Converter
+ *
+ * This file implements the NL2SQL conversion pipeline including:
+ * - Vector cache operations for semantic similarity
+ * - Model selection based on latency/budget
+ * - LLM API calls (Ollama, OpenAI, Anthropic)
+ * - SQL validation and cleaning
+ *
+ * @see NL2SQL_Converter.h
+ */
+
 #include "NL2SQL_Converter.h"
 #include "sqlite3db.h"
 #include "proxysql_utils.h"
@ -12,6 +25,14 @@ using json = nlohmann::json;
 // Global instance is defined elsewhere if needed
 // NL2SQL_Converter *GloNL2SQL = NULL;

+// ============================================================================
+// Constructor/Destructor
+// ============================================================================
+
+/**
+ * @brief Construct the converter with default configuration values
+ *
+ * vector_db starts as NULL; it will be set by AI_Features_Manager during init().
+ */
 NL2SQL_Converter::NL2SQL_Converter() : vector_db(NULL) {
 	config.enabled = true;
 	config.query_prefix = strdup("NL2SQL:");
@ -36,6 +57,14 @@ NL2SQL_Converter::~NL2SQL_Converter() {
 	free(config.anthropic_key);
 }

+// ============================================================================
+// Lifecycle
+// ============================================================================
+
+/**
+ * @brief Initialize the NL2SQL converter
+ *
+ * The vector DB will be provided by AI_Features_Manager during initialization.
+ */
 int NL2SQL_Converter::init() {
 	proxy_info("NL2SQL: Initializing NL2SQL Converter v%s\n", NL2SQL_CONVERTER_VERSION);

@ -187,15 +216,22 @@ std::string NL2SQL_Converter::get_schema_context(const std::vector<std::string>&
 // ============================================================================

 /**
- * @brief Convert natural language to SQL
+ * @brief Convert natural language to SQL (main entry point)
 *
- * This is the main entry point for NL2SQL conversion. The flow is:
+ * Conversion Pipeline:
 * 1. Check vector cache for semantically similar queries
 * 2. Build prompt with schema context
 * 3. Select appropriate model (Ollama/OpenAI/Anthropic)
- * 4. Call LLM API
+ * 4. Call LLM API via HTTP
 * 5. Parse and clean SQL response
 * 6. Store in vector cache for future use
+ *
+ * The confidence score is calculated based on:
+ * - SQL keyword validation (does it look like SQL?)
+ * - Response quality (non-empty, well-formed)
+ * - Default score of 0.85 for valid-looking SQL
+ *
+ * @note This is a synchronous blocking call.
 */
 NL2SQLResult NL2SQL_Converter::convert(const NL2SQLRequest& req) {
 	NL2SQLResult result;