#ifndef CLASS_DISCOVERY_SCHEMA_H #define CLASS_DISCOVERY_SCHEMA_H #ifdef PROXYSQLGENAI #include "sqlite3db.h" #include #include #include #include #include #include "json.hpp" /** * @brief MCP query rule structure * * Action is inferred from rule properties: * - if error_msg != NULL → block * - if replace_pattern != NULL → rewrite * - if timeout_ms > 0 → timeout * - otherwise → allow * * Note: 'hits' is only for in-memory tracking, not persisted to the table. */ struct MCP_Query_Rule { int rule_id; bool active; char *username; char *target_id; char *schemaname; char *tool_name; char *match_pattern; bool negate_match_pattern; int re_modifiers; // bitmask: 1=CASELESS int flagIN; int flagOUT; char *replace_pattern; int timeout_ms; char *error_msg; char *ok_msg; int log; // tri-state: -1=unset(NULL), 0=don't log, 1=log bool apply; char *comment; uint64_t hits; // in-memory only, not persisted to table void* regex_engine; // compiled regex (RE2) MCP_Query_Rule() : rule_id(0), active(false), username(NULL), target_id(NULL), schemaname(NULL), tool_name(NULL), match_pattern(NULL), negate_match_pattern(false), re_modifiers(1), flagIN(0), flagOUT(-1), replace_pattern(NULL), timeout_ms(-1), error_msg(NULL), ok_msg(NULL), log(-1), apply(true), comment(NULL), hits(0), regex_engine(NULL) {} }; /** * @brief MCP query digest statistics */ struct MCP_Query_Digest_Stats { std::string tool_name; int run_id; uint64_t digest; std::string digest_text; unsigned int count_star; time_t first_seen; time_t last_seen; unsigned long long sum_time; unsigned long long min_time; unsigned long long max_time; MCP_Query_Digest_Stats() : run_id(-1), digest(0), count_star(0), first_seen(0), last_seen(0), sum_time(0), min_time(0), max_time(0) {} void add_timing(unsigned long long duration_us, time_t timestamp) { count_star++; sum_time += duration_us; if (duration_us < min_time || min_time == 0) min_time = duration_us; if (duration_us > max_time) max_time = duration_us; if (first_seen == 0) first_seen = timestamp; last_seen = timestamp; } }; /** * @brief MCP query processor output * * This structure collects all possible actions from matching MCP query rules. * A single rule can perform multiple actions simultaneously (rewrite + timeout + block). * Actions are inferred from rule properties: * - if error_msg != NULL → block * - if replace_pattern != NULL → rewrite * - if timeout_ms > 0 → timeout * - if OK_msg != NULL → return OK message * * The calling code checks these fields and performs the appropriate actions. */ struct MCP_Query_Processor_Output { std::string *new_query; // Rewritten query (caller must delete) int timeout_ms; // Query timeout in milliseconds (-1 = not set) char *error_msg; // Error message to return (NULL = not set) char *OK_msg; // OK message to return (NULL = not set) int log; // Whether to log this query (-1 = not set, 0 = no, 1 = yes) int next_query_flagIN; // Flag for next query (-1 = not set) void init() { new_query = NULL; timeout_ms = -1; error_msg = NULL; OK_msg = NULL; log = -1; next_query_flagIN = -1; } void destroy() { if (new_query) { delete new_query; new_query = NULL; } if (error_msg) { free(error_msg); error_msg = NULL; } if (OK_msg) { free(OK_msg); OK_msg = NULL; } } MCP_Query_Processor_Output() { init(); } ~MCP_Query_Processor_Output() { destroy(); } }; /** * @brief Two-Phase Discovery Catalog Schema Manager * * This class manages a comprehensive SQLite catalog for database discovery with two layers: * 1. Deterministic Layer: Static metadata harvested from MySQL INFORMATION_SCHEMA * 2. LLM Agent Layer: Semantic interpretations generated by LLM agents * * Schema separates deterministic metadata (runs, objects, columns, indexes, fks) * from LLM-generated semantics (summaries, domains, metrics, question templates). */ class Discovery_Schema { private: SQLite3DB* db; std::string db_path; // MCP query rules management std::vector mcp_query_rules; pthread_rwlock_t mcp_rules_lock; volatile unsigned int mcp_rules_version; // MCP query digest statistics std::unordered_map> mcp_digest_umap; pthread_rwlock_t mcp_digest_rwlock; /** * @brief Initialize catalog schema with all tables * @return 0 on success, -1 on error */ int init_schema(); /** * @brief Create deterministic layer tables * @return 0 on success, -1 on error */ int create_deterministic_tables(); /** * @brief Create LLM agent layer tables * @return 0 on success, -1 on error */ int create_llm_tables(); /** * @brief Create FTS5 indexes * @return 0 on success, -1 on error */ int create_fts_tables(); public: /** * @brief Constructor * @param path Path to the catalog database file */ Discovery_Schema(const std::string& path); /** * @brief Destructor */ ~Discovery_Schema(); /** * @brief Initialize the catalog database * @return 0 on success, -1 on error */ int init(); /** * @brief Close the catalog database */ void close(); /** * @brief Resolve schema name or run_id to a run_id within a target scope * * If input is a numeric run_id, returns it as-is. * If input is a schema name, finds the latest run_id for that schema. * * @param target_id Required target scope identifier * @param run_id_or_schema Either a numeric run_id or a schema name * @return run_id on success, -1 if schema not found */ int resolve_run_id(const std::string& target_id, const std::string& run_id_or_schema); /** * @brief Create a new discovery run * * @param target_id Logical target identifier that produced this run * @param protocol Backend protocol for this run (mysql|pgsql) * @param source_dsn Data source identifier (e.g., "mysql://host:port/") * @param server_version Backend server version string * @param notes Optional notes for this run * @return run_id on success, -1 on error */ int create_run( const std::string& target_id, const std::string& protocol, const std::string& source_dsn, const std::string& server_version, const std::string& notes = "" ); /** * @brief Finish a discovery run * * @param run_id The run ID to finish * @param notes Optional completion notes * @return 0 on success, -1 on error */ int finish_run(int run_id, const std::string& notes = ""); /** * @brief Get run ID info * * @param run_id The run ID * @return JSON string with run info */ std::string get_run_info(int run_id); /** * @brief Create a new LLM agent run bound to a deterministic run * * @param run_id The deterministic run ID * @param model_name Model name (e.g., "claude-3.5-sonnet") * @param prompt_hash Optional hash of system prompt * @param budget_json Optional budget JSON * @return agent_run_id on success, -1 on error */ int create_agent_run( int run_id, const std::string& model_name, const std::string& prompt_hash = "", const std::string& budget_json = "" ); /** * @brief Finish an agent run * * @param agent_run_id The agent run ID * @param status Status: "success" or "failed" * @param error Optional error message * @return 0 on success, -1 on error */ int finish_agent_run( int agent_run_id, const std::string& status, const std::string& error = "" ); /** * @brief Get the last (most recent) agent_run_id for a given run_id * * @param run_id Run ID * @return agent_run_id on success, 0 if no agent runs exist for this run_id */ int get_last_agent_run_id(int run_id); /** * @brief Insert a schema * * @param run_id Run ID * @param schema_name Schema/database name * @param charset Character set * @param collation Collation * @return schema_id on success, -1 on error */ int insert_schema( int run_id, const std::string& schema_name, const std::string& charset = "", const std::string& collation = "" ); /** * @brief Insert an object (table/view/routine/trigger) * * @param run_id Run ID * @param schema_name Schema name * @param object_name Object name * @param object_type Object type (table/view/routine/trigger) * @param engine Storage engine (for tables) * @param table_rows_est Estimated row count * @param data_length Data length in bytes * @param index_length Index length in bytes * @param create_time Creation time * @param update_time Last update time * @param object_comment Object comment * @param definition_sql Definition SQL (for views/routines) * @return object_id on success, -1 on error */ int insert_object( int run_id, const std::string& schema_name, const std::string& object_name, const std::string& object_type, const std::string& engine = "", long table_rows_est = 0, long data_length = 0, long index_length = 0, const std::string& create_time = "", const std::string& update_time = "", const std::string& object_comment = "", const std::string& definition_sql = "" ); /** * @brief Insert a column * * @param object_id Object ID * @param ordinal_pos Ordinal position * @param column_name Column name * @param data_type Data type * @param column_type Full column type * @param is_nullable Is nullable (0/1) * @param column_default Default value * @param extra Extra info (auto_increment, etc.) * @param charset Character set * @param collation Collation * @param column_comment Column comment * @param is_pk Is primary key (0/1) * @param is_unique Is unique (0/1) * @param is_indexed Is indexed (0/1) * @param is_time Is time type (0/1) * @param is_id_like Is ID-like name (0/1) * @return column_id on success, -1 on error */ int insert_column( int object_id, int ordinal_pos, const std::string& column_name, const std::string& data_type, const std::string& column_type = "", int is_nullable = 1, const std::string& column_default = "", const std::string& extra = "", const std::string& charset = "", const std::string& collation = "", const std::string& column_comment = "", int is_pk = 0, int is_unique = 0, int is_indexed = 0, int is_time = 0, int is_id_like = 0 ); /** * @brief Insert an index * * @param object_id Object ID * @param index_name Index name * @param is_unique Is unique (0/1) * @param is_primary Is primary key (0/1) * @param index_type Index type (BTREE/HASH/FULLTEXT) * @param cardinality Cardinality * @return index_id on success, -1 on error */ int insert_index( int object_id, const std::string& index_name, int is_unique = 0, int is_primary = 0, const std::string& index_type = "", long cardinality = 0 ); /** * @brief Insert an index column * * @param index_id Index ID * @param seq_in_index Sequence in index * @param column_name Column name * @param sub_part Sub-part length * @param collation Collation (A/D) * @return 0 on success, -1 on error */ int insert_index_column( int index_id, int seq_in_index, const std::string& column_name, int sub_part = 0, const std::string& collation = "A" ); /** * @brief Insert a foreign key * * @param run_id Run ID * @param child_object_id Child object ID * @param fk_name FK name * @param parent_schema_name Parent schema name * @param parent_object_name Parent object name * @param on_update ON UPDATE rule * @param on_delete ON DELETE rule * @return fk_id on success, -1 on error */ int insert_foreign_key( int run_id, int child_object_id, const std::string& fk_name, const std::string& parent_schema_name, const std::string& parent_object_name, const std::string& on_update = "", const std::string& on_delete = "" ); /** * @brief Insert a foreign key column * * @param fk_id FK ID * @param seq Sequence number * @param child_column Child column name * @param parent_column Parent column name * @return 0 on success, -1 on error */ int insert_foreign_key_column( int fk_id, int seq, const std::string& child_column, const std::string& parent_column ); /** * @brief Update object derived flags * * Updates has_primary_key, has_foreign_keys, has_time_column flags * based on actual data in columns, indexes, foreign_keys tables. * * @param run_id Run ID * @return 0 on success, -1 on error */ int update_object_flags(int run_id); /** * @brief Insert or update a profile * * @param run_id Run ID * @param object_id Object ID * @param profile_kind Profile kind (table_quick, column, time_range, etc.) * @param profile_json Profile data as JSON string * @return 0 on success, -1 on error */ int upsert_profile( int run_id, int object_id, const std::string& profile_kind, const std::string& profile_json ); /** * @brief Rebuild FTS index for a run * * Deletes and rebuilds the fts_objects index for all objects in a run. * * @param run_id Run ID * @return 0 on success, -1 on error */ int rebuild_fts_index(int run_id); /** * @brief Full-text search over objects * * @param run_id Run ID * @param query FTS5 query * @param limit Max results * @param object_type Optional filter by object type * @param schema_name Optional filter by schema name * @return JSON array of matching objects */ std::string fts_search( int run_id, const std::string& query, int limit = 25, const std::string& object_type = "", const std::string& schema_name = "" ); /** * @brief Get object by ID or key * * @param run_id Run ID * @param object_id Object ID (optional) * @param schema_name Schema name (if using object_key) * @param object_name Object name (if using object_key) * @param include_definition Include view/routine definitions * @param include_profiles Include profile data * @return JSON string with object details */ std::string get_object( int run_id, int object_id = -1, const std::string& schema_name = "", const std::string& object_name = "", bool include_definition = false, bool include_profiles = true ); /** * @brief List objects with pagination * * @param run_id Run ID * @param schema_name Optional schema filter * @param object_type Optional object type filter * @param order_by Order by field (name/rows_est_desc/size_desc) * @param page_size Page size * @param page_token Page token (empty for first page) * @return JSON string with results and next page token */ std::string list_objects( int run_id, const std::string& schema_name = "", const std::string& object_type = "", const std::string& order_by = "name", int page_size = 50, const std::string& page_token = "" ); /** * @brief Get relationships for an object * * Returns foreign keys, view dependencies, and inferred relationships. * * @param run_id Run ID * @param object_id Object ID * @param include_inferred Include LLM-inferred relationships * @param min_confidence Minimum confidence for inferred relationships * @return JSON string with relationships */ std::string get_relationships( int run_id, int object_id, bool include_inferred = true, double min_confidence = 0.0 ); /** * @brief Append an agent event * * @param agent_run_id Agent run ID * @param event_type Event type (tool_call/tool_result/note/decision) * @param payload_json Event payload as JSON string * @return event_id on success, -1 on error */ int append_agent_event( int agent_run_id, const std::string& event_type, const std::string& payload_json ); /** * @brief Upsert an LLM object summary * * @param agent_run_id Agent run ID * @param run_id Deterministic run ID * @param object_id Object ID * @param summary_json Summary data as JSON string * @param confidence Confidence score (0.0-1.0) * @param status Status (draft/validated/stable) * @param sources_json Optional sources evidence * @return 0 on success, -1 on error */ int upsert_llm_summary( int agent_run_id, int run_id, int object_id, const std::string& summary_json, double confidence = 0.5, const std::string& status = "draft", const std::string& sources_json = "" ); /** * @brief Get LLM summary for an object * * @param run_id Run ID * @param object_id Object ID * @param agent_run_id Optional specific agent run ID * @param latest Get latest summary across all agent runs * @return JSON string with summary or null */ std::string get_llm_summary( int run_id, int object_id, int agent_run_id = -1, bool latest = true ); /** * @brief Upsert an LLM-inferred relationship * * @param agent_run_id Agent run ID * @param run_id Deterministic run ID * @param child_object_id Child object ID * @param child_column Child column name * @param parent_object_id Parent object ID * @param parent_column Parent column name * @param rel_type Relationship type (fk_like/bridge/polymorphic/etc) * @param confidence Confidence score * @param evidence_json Evidence JSON string * @return 0 on success, -1 on error */ int upsert_llm_relationship( int agent_run_id, int run_id, int child_object_id, const std::string& child_column, int parent_object_id, const std::string& parent_column, const std::string& rel_type = "fk_like", double confidence = 0.6, const std::string& evidence_json = "" ); /** * @brief Upsert a domain * * @param agent_run_id Agent run ID * @param run_id Deterministic run ID * @param domain_key Domain key (e.g., "billing", "sales") * @param title Domain title * @param description Domain description * @param confidence Confidence score * @return domain_id on success, -1 on error */ int upsert_llm_domain( int agent_run_id, int run_id, const std::string& domain_key, const std::string& title = "", const std::string& description = "", double confidence = 0.6 ); /** * @brief Set domain members * * Replaces all members of a domain with the provided list. * * @param agent_run_id Agent run ID * @param run_id Deterministic run ID * @param domain_key Domain key * @param members_json Members JSON array with object_id, role, confidence * @return 0 on success, -1 on error */ int set_domain_members( int agent_run_id, int run_id, const std::string& domain_key, const std::string& members_json ); /** * @brief Upsert a metric * * @param agent_run_id Agent run ID * @param run_id Deterministic run ID * @param metric_key Metric key (e.g., "orders.count") * @param title Metric title * @param description Metric description * @param domain_key Optional domain key * @param grain Grain (day/order/customer/etc) * @param unit Unit (USD/count/ms/etc) * @param sql_template Optional SQL template * @param depends_json Optional dependencies JSON * @param confidence Confidence score * @return metric_id on success, -1 on error */ int upsert_llm_metric( int agent_run_id, int run_id, const std::string& metric_key, const std::string& title, const std::string& description = "", const std::string& domain_key = "", const std::string& grain = "", const std::string& unit = "", const std::string& sql_template = "", const std::string& depends_json = "", double confidence = 0.6 ); /** * @brief Add a question template * * @param agent_run_id Agent run ID * @param run_id Deterministic run ID * @param title Template title * @param question_nl Natural language question * @param template_json Query plan template JSON * @param example_sql Optional example SQL * @param related_objects JSON array of related object names (tables/views) * @param confidence Confidence score * @return template_id on success, -1 on error */ int add_question_template( int agent_run_id, int run_id, const std::string& title, const std::string& question_nl, const std::string& template_json, const std::string& example_sql = "", const std::string& related_objects = "", double confidence = 0.6 ); /** * @brief Add an LLM note * * @param agent_run_id Agent run ID * @param run_id Deterministic run ID * @param scope Note scope (global/schema/object/domain) * @param object_id Optional object ID * @param domain_key Optional domain key * @param title Note title * @param body Note body * @param tags_json Optional tags JSON array * @return note_id on success, -1 on error */ int add_llm_note( int agent_run_id, int run_id, const std::string& scope, int object_id = -1, const std::string& domain_key = "", const std::string& title = "", const std::string& body = "", const std::string& tags_json = "" ); /** * @brief Full-text search over LLM artifacts * * @param run_id Run ID * @param query FTS query (empty to list all) * @param limit Max results * @param include_objects Include full object details for question templates * @return JSON array of matching LLM artifacts with example_sql and related_objects */ std::string fts_search_llm( int run_id, const std::string& query, int limit = 25, bool include_objects = false ); /** * @brief Log an LLM search query * * @param run_id Run ID * @param query Search query string * @param lmt Result limit * @return 0 on success, -1 on error */ int log_llm_search( int run_id, const std::string& query, int lmt = 25 ); /** * @brief Log RAG FTS search operation * * Logs RAG full-text search operations to the rag_search_log table. * Similar to log_llm_search() but for RAG searches. * * @param query Search query string * @param k Number of results requested * @param filters JSON string of filters applied (empty if none) * @return 0 on success, -1 on error */ int log_rag_search_fts( const std::string& query, int k, const std::string& filters = "" ); /** * @brief Log MCP tool invocation via /mcp/query/ endpoint * @param tool_name Name of the tool that was called * @param schema Schema name (empty if not applicable) * @param run_id Run ID (0 or -1 if not applicable) * @param start_time Start monotonic time (microseconds) * @param execution_time Execution duration (microseconds) * @param error Error message (empty if success) * @return 0 on success, -1 on error */ int log_query_tool_call( const std::string& tool_name, const std::string& schema, int run_id, unsigned long long start_time, unsigned long long execution_time, const std::string& error ); /** * @brief Get database handle for direct access * @return SQLite3DB pointer */ SQLite3DB* get_db() { return db; } /** * @brief Get the database file path * @return Database file path */ std::string get_db_path() const { return db_path; } // ============================================================ // MCP QUERY RULES // ============================================================ /** * @brief Load MCP query rules from SQLite */ void load_mcp_query_rules(SQLite3_result* resultset); /** * @brief Evaluate MCP query rules for a tool invocation * @return MCP_Query_Processor_Output object populated with actions from matching rules * Caller is responsible for destroying the returned object. */ MCP_Query_Processor_Output* evaluate_mcp_query_rules( const std::string& tool_name, const std::string& username, const std::string& target_id, const std::string& schemaname, const nlohmann::json& arguments, const std::string& original_query ); /** * @brief Get current MCP query rules as resultset */ SQLite3_result* get_mcp_query_rules(); /** * @brief Get stats for MCP query rules (hits per rule) */ SQLite3_result* get_stats_mcp_query_rules(); // ============================================================ // MCP QUERY DIGEST // ============================================================ /** * @brief Update MCP query digest statistics */ void update_mcp_query_digest( const std::string& tool_name, int run_id, uint64_t digest, const std::string& digest_text, unsigned long long duration_us, time_t timestamp ); /** * @brief Get MCP query digest statistics * @param reset If true, reset stats after retrieval */ SQLite3_result* get_mcp_query_digest(bool reset = false); /** * @brief Compute MCP query digest hash using SpookyHash */ static uint64_t compute_mcp_digest( const std::string& tool_name, const nlohmann::json& arguments ); /** * @brief Fingerprint MCP query arguments (replace literals with ?) */ static std::string fingerprint_mcp_args(const nlohmann::json& arguments); }; #endif /* PROXYSQLGENAI */ #endif /* CLASS_DISCOVERY_SCHEMA_H */