// proxysql/include/Discovery_Schema.h

#ifndef CLASS_DISCOVERY_SCHEMA_H
#define CLASS_DISCOVERY_SCHEMA_H
#ifdef PROXYSQLGENAI
#include "sqlite3db.h"
#include <string>
#include <vector>
#include <memory>
#include <pthread.h>
#include <unordered_map>
#include "json.hpp"
/**
 * @brief One MCP query rule as held in memory.
 *
 * A rule carries no explicit "action" field; the action is derived from
 * which properties are set:
 * - error_msg != NULL        → block
 * - replace_pattern != NULL  → rewrite
 * - timeout_ms > 0           → timeout
 * - none of the above        → allow
 *
 * Note: 'hits' exists only for in-memory tracking and is not persisted to
 * the table.
 */
struct MCP_Query_Rule {
    int rule_id = 0;
    bool active = false;
    char *username = nullptr;
    char *target_id = nullptr;
    char *schemaname = nullptr;
    char *tool_name = nullptr;
    char *match_pattern = nullptr;
    bool negate_match_pattern = false;
    int re_modifiers = 1;           // bitmask: 1=CASELESS (the default)
    int flagIN = 0;
    int flagOUT = 0;
    char *replace_pattern = nullptr;
    int timeout_ms = 0;
    char *error_msg = nullptr;
    char *ok_msg = nullptr;
    bool log = false;
    bool apply = true;
    char *comment = nullptr;
    uint64_t hits = 0;              // in-memory only, not persisted to table
    void *regex_engine = nullptr;   // compiled regex (RE2)

    // Defaults come from the in-class initializers above. The constructor
    // stays user-provided so the type does not become an aggregate.
    MCP_Query_Rule() {}
};
/**
 * @brief MCP query digest statistics
 *
 * Aggregated counters for one digest: number of executions, first/last
 * timestamps, and sum/min/max of the reported durations.
 */
struct MCP_Query_Digest_Stats {
    std::string tool_name;
    int run_id;
    uint64_t digest;
    std::string digest_text;
    unsigned int count_star;        // number of samples recorded via add_timing()
    time_t first_seen;              // timestamp of the first sample (0 = none yet)
    time_t last_seen;               // timestamp of the most recent sample
    unsigned long long sum_time;    // sum of all durations
    unsigned long long min_time;    // smallest duration recorded
    unsigned long long max_time;    // largest duration recorded

    MCP_Query_Digest_Stats() : run_id(-1), digest(0), count_star(0),
                               first_seen(0), last_seen(0),
                               sum_time(0), min_time(0), max_time(0) {}

    /**
     * @brief Record one execution.
     *
     * @param duration_us Duration of the execution (microseconds)
     * @param timestamp   Time at which the execution was observed
     */
    void add_timing(unsigned long long duration_us, time_t timestamp) {
        // Detect the first sample through the counter instead of treating
        // min_time == 0 as "unset": a genuine 0us sample must remain the
        // minimum when later, slower samples arrive.
        const bool first_sample = (count_star == 0);
        count_star++;
        sum_time += duration_us;
        if (first_sample || duration_us < min_time) min_time = duration_us;
        if (duration_us > max_time) max_time = duration_us;
        if (first_seen == 0) first_seen = timestamp;
        last_seen = timestamp;
    }
};
/**
 * @brief MCP query processor output
 *
 * This structure collects all possible actions from matching MCP query rules.
 * A single rule can perform multiple actions simultaneously (rewrite + timeout + block).
 * Actions are inferred from rule properties:
 * - if error_msg != NULL → block
 * - if replace_pattern != NULL → rewrite
 * - if timeout_ms > 0 → timeout
 * - if OK_msg != NULL → return OK message
 *
 * The calling code checks these fields and performs the appropriate actions.
 *
 * Ownership: new_query, error_msg and OK_msg are released by destroy(), which
 * the destructor calls. Because of that the type is move-only: a member-wise
 * copy would make two objects release the same pointers.
 */
struct MCP_Query_Processor_Output {
    std::string *new_query;     // Rewritten query; deleted by destroy(). A caller that takes
                                // the string over must set this to NULL before destruction.
    int timeout_ms;             // Query timeout in milliseconds (-1 = not set)
    char *error_msg;            // Error message to return (NULL = not set); released with free()
    char *OK_msg;               // OK message to return (NULL = not set); released with free()
    int log;                    // Whether to log this query (-1 = not set, 0 = no, 1 = yes)
    int next_query_flagIN;      // Flag for next query (-1 = not set)

    /**
     * @brief Put every field into its "not set" state without releasing anything.
     */
    void init() {
        new_query = NULL;
        timeout_ms = -1;
        error_msg = NULL;
        OK_msg = NULL;
        log = -1;
        next_query_flagIN = -1;
    }

    /**
     * @brief Release the owned pointers and null them; safe to call repeatedly.
     *
     * The integer fields are left untouched.
     */
    void destroy() {
        if (new_query) {
            delete new_query;
            new_query = NULL;
        }
        if (error_msg) {
            free(error_msg);
            error_msg = NULL;
        }
        if (OK_msg) {
            free(OK_msg);
            OK_msg = NULL;
        }
    }

    MCP_Query_Processor_Output() {
        init();
    }

    ~MCP_Query_Processor_Output() {
        destroy();
    }

    // Copying would duplicate the owning pointers and lead to a double free.
    MCP_Query_Processor_Output(const MCP_Query_Processor_Output&) = delete;
    MCP_Query_Processor_Output& operator=(const MCP_Query_Processor_Output&) = delete;

    /**
     * @brief Move constructor: takes over the source's pointers and resets the source.
     */
    MCP_Query_Processor_Output(MCP_Query_Processor_Output&& other) noexcept {
        take_from(other);
    }

    /**
     * @brief Move assignment: releases current contents, then takes over the source's.
     */
    MCP_Query_Processor_Output& operator=(MCP_Query_Processor_Output&& other) noexcept {
        if (this != &other) {
            destroy();
            take_from(other);
        }
        return *this;
    }

private:
    // Transfers every field from 'other' and leaves 'other' in the init() state.
    void take_from(MCP_Query_Processor_Output& other) noexcept {
        new_query = other.new_query;
        timeout_ms = other.timeout_ms;
        error_msg = other.error_msg;
        OK_msg = other.OK_msg;
        log = other.log;
        next_query_flagIN = other.next_query_flagIN;
        other.init();
    }
};
/**
 * @brief Two-Phase Discovery Catalog Schema Manager
 *
 * This class manages a comprehensive SQLite catalog for database discovery with two layers:
 * 1. Deterministic Layer: Static metadata harvested from MySQL INFORMATION_SCHEMA
 * 2. LLM Agent Layer: Semantic interpretations generated by LLM agents
 *
 * Schema separates deterministic metadata (runs, objects, columns, indexes, fks)
 * from LLM-generated semantics (summaries, domains, metrics, question templates).
 *
 * The class holds a raw database handle and two pthread read-write locks, so
 * it is non-copyable.
 */
class Discovery_Schema {
private:
    SQLite3DB* db;
    std::string db_path;

    // MCP query rules management
    std::vector<MCP_Query_Rule*> mcp_query_rules;
    pthread_rwlock_t mcp_rules_lock;
    // NOTE(review): 'volatile' gives neither atomicity nor ordering; confirm every
    // access happens under mcp_rules_lock or through atomic builtins.
    volatile unsigned int mcp_rules_version;

    // MCP query digest statistics
    // Outer key is a string, inner key a 64-bit digest. The mapped values are
    // presumably MCP_Query_Digest_Stats* - TODO confirm against the implementation.
    std::unordered_map<std::string, std::unordered_map<uint64_t, void*>> mcp_digest_umap;
    pthread_rwlock_t mcp_digest_rwlock;

    /**
     * @brief Initialize catalog schema with all tables
     * @return 0 on success, -1 on error
     */
    int init_schema();

    /**
     * @brief Create deterministic layer tables
     * @return 0 on success, -1 on error
     */
    int create_deterministic_tables();

    /**
     * @brief Create LLM agent layer tables
     * @return 0 on success, -1 on error
     */
    int create_llm_tables();

    /**
     * @brief Create FTS5 indexes
     * @return 0 on success, -1 on error
     */
    int create_fts_tables();

public:
    /**
     * @brief Constructor
     *
     * Declared explicit so that a std::string is never silently converted
     * into a catalog manager.
     *
     * @param path Path to the catalog database file
     */
    explicit Discovery_Schema(const std::string& path);

    /**
     * @brief Destructor
     */
    ~Discovery_Schema();

    // Non-copyable: a copy would share the raw database handle and duplicate
    // the pthread_rwlock_t objects.
    Discovery_Schema(const Discovery_Schema&) = delete;
    Discovery_Schema& operator=(const Discovery_Schema&) = delete;

    /**
     * @brief Initialize the catalog database
     * @return 0 on success, -1 on error
     */
    int init();

    /**
     * @brief Close the catalog database
     */
    void close();

    /**
     * @brief Resolve schema name or run_id to a run_id within a target scope
     *
     * If input is a numeric run_id, returns it as-is.
     * If input is a schema name, finds the latest run_id for that schema.
     *
     * @param target_id Required target scope identifier
     * @param run_id_or_schema Either a numeric run_id or a schema name
     * @return run_id on success, -1 if schema not found
     */
    int resolve_run_id(const std::string& target_id, const std::string& run_id_or_schema);

    /**
     * @brief Create a new discovery run
     *
     * @param target_id Logical target identifier that produced this run
     * @param protocol Backend protocol for this run (mysql|pgsql)
     * @param source_dsn Data source identifier (e.g., "mysql://host:port/")
     * @param server_version Backend server version string
     * @param notes Optional notes for this run
     * @return run_id on success, -1 on error
     */
    int create_run(
        const std::string& target_id,
        const std::string& protocol,
        const std::string& source_dsn,
        const std::string& server_version,
        const std::string& notes = ""
    );

    /**
     * @brief Finish a discovery run
     *
     * @param run_id The run ID to finish
     * @param notes Optional completion notes
     * @return 0 on success, -1 on error
     */
    int finish_run(int run_id, const std::string& notes = "");

    /**
     * @brief Get run ID info
     *
     * @param run_id The run ID
     * @return JSON string with run info
     */
    std::string get_run_info(int run_id);

    /**
     * @brief Create a new LLM agent run bound to a deterministic run
     *
     * @param run_id The deterministic run ID
     * @param model_name Model name (e.g., "claude-3.5-sonnet")
     * @param prompt_hash Optional hash of system prompt
     * @param budget_json Optional budget JSON
     * @return agent_run_id on success, -1 on error
     */
    int create_agent_run(
        int run_id,
        const std::string& model_name,
        const std::string& prompt_hash = "",
        const std::string& budget_json = ""
    );

    /**
     * @brief Finish an agent run
     *
     * @param agent_run_id The agent run ID
     * @param status Status: "success" or "failed"
     * @param error Optional error message
     * @return 0 on success, -1 on error
     */
    int finish_agent_run(
        int agent_run_id,
        const std::string& status,
        const std::string& error = ""
    );

    /**
     * @brief Get the last (most recent) agent_run_id for a given run_id
     *
     * Unlike most methods of this class, "not found" is reported as 0, not -1.
     *
     * @param run_id Run ID
     * @return agent_run_id on success, 0 if no agent runs exist for this run_id
     */
    int get_last_agent_run_id(int run_id);

    /**
     * @brief Insert a schema
     *
     * @param run_id Run ID
     * @param schema_name Schema/database name
     * @param charset Character set
     * @param collation Collation
     * @return schema_id on success, -1 on error
     */
    int insert_schema(
        int run_id,
        const std::string& schema_name,
        const std::string& charset = "",
        const std::string& collation = ""
    );

    /**
     * @brief Insert an object (table/view/routine/trigger)
     *
     * @param run_id Run ID
     * @param schema_name Schema name
     * @param object_name Object name
     * @param object_type Object type (table/view/routine/trigger)
     * @param engine Storage engine (for tables)
     * @param table_rows_est Estimated row count
     * @param data_length Data length in bytes
     * @param index_length Index length in bytes
     * @param create_time Creation time
     * @param update_time Last update time
     * @param object_comment Object comment
     * @param definition_sql Definition SQL (for views/routines)
     * @return object_id on success, -1 on error
     */
    int insert_object(
        int run_id,
        const std::string& schema_name,
        const std::string& object_name,
        const std::string& object_type,
        const std::string& engine = "",
        long table_rows_est = 0,
        long data_length = 0,
        long index_length = 0,
        const std::string& create_time = "",
        const std::string& update_time = "",
        const std::string& object_comment = "",
        const std::string& definition_sql = ""
    );

    /**
     * @brief Insert a column
     *
     * @param object_id Object ID
     * @param ordinal_pos Ordinal position
     * @param column_name Column name
     * @param data_type Data type
     * @param column_type Full column type
     * @param is_nullable Is nullable (0/1)
     * @param column_default Default value
     * @param extra Extra info (auto_increment, etc.)
     * @param charset Character set
     * @param collation Collation
     * @param column_comment Column comment
     * @param is_pk Is primary key (0/1)
     * @param is_unique Is unique (0/1)
     * @param is_indexed Is indexed (0/1)
     * @param is_time Is time type (0/1)
     * @param is_id_like Is ID-like name (0/1)
     * @return column_id on success, -1 on error
     */
    int insert_column(
        int object_id,
        int ordinal_pos,
        const std::string& column_name,
        const std::string& data_type,
        const std::string& column_type = "",
        int is_nullable = 1,
        const std::string& column_default = "",
        const std::string& extra = "",
        const std::string& charset = "",
        const std::string& collation = "",
        const std::string& column_comment = "",
        int is_pk = 0,
        int is_unique = 0,
        int is_indexed = 0,
        int is_time = 0,
        int is_id_like = 0
    );

    /**
     * @brief Insert an index
     *
     * @param object_id Object ID
     * @param index_name Index name
     * @param is_unique Is unique (0/1)
     * @param is_primary Is primary key (0/1)
     * @param index_type Index type (BTREE/HASH/FULLTEXT)
     * @param cardinality Cardinality
     * @return index_id on success, -1 on error
     */
    int insert_index(
        int object_id,
        const std::string& index_name,
        int is_unique = 0,
        int is_primary = 0,
        const std::string& index_type = "",
        long cardinality = 0
    );

    /**
     * @brief Insert an index column
     *
     * @param index_id Index ID
     * @param seq_in_index Sequence in index
     * @param column_name Column name
     * @param sub_part Sub-part length
     * @param collation Collation (A/D)
     * @return 0 on success, -1 on error
     */
    int insert_index_column(
        int index_id,
        int seq_in_index,
        const std::string& column_name,
        int sub_part = 0,
        const std::string& collation = "A"
    );

    /**
     * @brief Insert a foreign key
     *
     * @param run_id Run ID
     * @param child_object_id Child object ID
     * @param fk_name FK name
     * @param parent_schema_name Parent schema name
     * @param parent_object_name Parent object name
     * @param on_update ON UPDATE rule
     * @param on_delete ON DELETE rule
     * @return fk_id on success, -1 on error
     */
    int insert_foreign_key(
        int run_id,
        int child_object_id,
        const std::string& fk_name,
        const std::string& parent_schema_name,
        const std::string& parent_object_name,
        const std::string& on_update = "",
        const std::string& on_delete = ""
    );

    /**
     * @brief Insert a foreign key column
     *
     * @param fk_id FK ID
     * @param seq Sequence number
     * @param child_column Child column name
     * @param parent_column Parent column name
     * @return 0 on success, -1 on error
     */
    int insert_foreign_key_column(
        int fk_id,
        int seq,
        const std::string& child_column,
        const std::string& parent_column
    );

    /**
     * @brief Update object derived flags
     *
     * Updates has_primary_key, has_foreign_keys, has_time_column flags
     * based on actual data in columns, indexes, foreign_keys tables.
     *
     * @param run_id Run ID
     * @return 0 on success, -1 on error
     */
    int update_object_flags(int run_id);

    /**
     * @brief Insert or update a profile
     *
     * @param run_id Run ID
     * @param object_id Object ID
     * @param profile_kind Profile kind (table_quick, column, time_range, etc.)
     * @param profile_json Profile data as JSON string
     * @return 0 on success, -1 on error
     */
    int upsert_profile(
        int run_id,
        int object_id,
        const std::string& profile_kind,
        const std::string& profile_json
    );

    /**
     * @brief Rebuild FTS index for a run
     *
     * Deletes and rebuilds the fts_objects index for all objects in a run.
     *
     * @param run_id Run ID
     * @return 0 on success, -1 on error
     */
    int rebuild_fts_index(int run_id);

    /**
     * @brief Full-text search over objects
     *
     * @param run_id Run ID
     * @param query FTS5 query
     * @param limit Max results
     * @param object_type Optional filter by object type
     * @param schema_name Optional filter by schema name
     * @return JSON array of matching objects
     */
    std::string fts_search(
        int run_id,
        const std::string& query,
        int limit = 25,
        const std::string& object_type = "",
        const std::string& schema_name = ""
    );

    /**
     * @brief Get object by ID or key
     *
     * @param run_id Run ID
     * @param object_id Object ID (optional)
     * @param schema_name Schema name (if using object_key)
     * @param object_name Object name (if using object_key)
     * @param include_definition Include view/routine definitions
     * @param include_profiles Include profile data
     * @return JSON string with object details
     */
    std::string get_object(
        int run_id,
        int object_id = -1,
        const std::string& schema_name = "",
        const std::string& object_name = "",
        bool include_definition = false,
        bool include_profiles = true
    );

    /**
     * @brief List objects with pagination
     *
     * @param run_id Run ID
     * @param schema_name Optional schema filter
     * @param object_type Optional object type filter
     * @param order_by Order by field (name/rows_est_desc/size_desc)
     * @param page_size Page size
     * @param page_token Page token (empty for first page)
     * @return JSON string with results and next page token
     */
    std::string list_objects(
        int run_id,
        const std::string& schema_name = "",
        const std::string& object_type = "",
        const std::string& order_by = "name",
        int page_size = 50,
        const std::string& page_token = ""
    );

    /**
     * @brief Get relationships for an object
     *
     * Returns foreign keys, view dependencies, and inferred relationships.
     *
     * @param run_id Run ID
     * @param object_id Object ID
     * @param include_inferred Include LLM-inferred relationships
     * @param min_confidence Minimum confidence for inferred relationships
     * @return JSON string with relationships
     */
    std::string get_relationships(
        int run_id,
        int object_id,
        bool include_inferred = true,
        double min_confidence = 0.0
    );

    /**
     * @brief Append an agent event
     *
     * @param agent_run_id Agent run ID
     * @param event_type Event type (tool_call/tool_result/note/decision)
     * @param payload_json Event payload as JSON string
     * @return event_id on success, -1 on error
     */
    int append_agent_event(
        int agent_run_id,
        const std::string& event_type,
        const std::string& payload_json
    );

    /**
     * @brief Upsert an LLM object summary
     *
     * @param agent_run_id Agent run ID
     * @param run_id Deterministic run ID
     * @param object_id Object ID
     * @param summary_json Summary data as JSON string
     * @param confidence Confidence score (0.0-1.0)
     * @param status Status (draft/validated/stable)
     * @param sources_json Optional sources evidence
     * @return 0 on success, -1 on error
     */
    int upsert_llm_summary(
        int agent_run_id,
        int run_id,
        int object_id,
        const std::string& summary_json,
        double confidence = 0.5,
        const std::string& status = "draft",
        const std::string& sources_json = ""
    );

    /**
     * @brief Get LLM summary for an object
     *
     * @param run_id Run ID
     * @param object_id Object ID
     * @param agent_run_id Optional specific agent run ID
     * @param latest Get latest summary across all agent runs
     * @return JSON string with summary or null
     */
    std::string get_llm_summary(
        int run_id,
        int object_id,
        int agent_run_id = -1,
        bool latest = true
    );

    /**
     * @brief Upsert an LLM-inferred relationship
     *
     * @param agent_run_id Agent run ID
     * @param run_id Deterministic run ID
     * @param child_object_id Child object ID
     * @param child_column Child column name
     * @param parent_object_id Parent object ID
     * @param parent_column Parent column name
     * @param rel_type Relationship type (fk_like/bridge/polymorphic/etc)
     * @param confidence Confidence score
     * @param evidence_json Evidence JSON string
     * @return 0 on success, -1 on error
     */
    int upsert_llm_relationship(
        int agent_run_id,
        int run_id,
        int child_object_id,
        const std::string& child_column,
        int parent_object_id,
        const std::string& parent_column,
        const std::string& rel_type = "fk_like",
        double confidence = 0.6,
        const std::string& evidence_json = ""
    );

    /**
     * @brief Upsert a domain
     *
     * @param agent_run_id Agent run ID
     * @param run_id Deterministic run ID
     * @param domain_key Domain key (e.g., "billing", "sales")
     * @param title Domain title
     * @param description Domain description
     * @param confidence Confidence score
     * @return domain_id on success, -1 on error
     */
    int upsert_llm_domain(
        int agent_run_id,
        int run_id,
        const std::string& domain_key,
        const std::string& title = "",
        const std::string& description = "",
        double confidence = 0.6
    );

    /**
     * @brief Set domain members
     *
     * Replaces all members of a domain with the provided list.
     *
     * @param agent_run_id Agent run ID
     * @param run_id Deterministic run ID
     * @param domain_key Domain key
     * @param members_json Members JSON array with object_id, role, confidence
     * @return 0 on success, -1 on error
     */
    int set_domain_members(
        int agent_run_id,
        int run_id,
        const std::string& domain_key,
        const std::string& members_json
    );

    /**
     * @brief Upsert a metric
     *
     * @param agent_run_id Agent run ID
     * @param run_id Deterministic run ID
     * @param metric_key Metric key (e.g., "orders.count")
     * @param title Metric title
     * @param description Metric description
     * @param domain_key Optional domain key
     * @param grain Grain (day/order/customer/etc)
     * @param unit Unit (USD/count/ms/etc)
     * @param sql_template Optional SQL template
     * @param depends_json Optional dependencies JSON
     * @param confidence Confidence score
     * @return metric_id on success, -1 on error
     */
    int upsert_llm_metric(
        int agent_run_id,
        int run_id,
        const std::string& metric_key,
        const std::string& title,
        const std::string& description = "",
        const std::string& domain_key = "",
        const std::string& grain = "",
        const std::string& unit = "",
        const std::string& sql_template = "",
        const std::string& depends_json = "",
        double confidence = 0.6
    );

    /**
     * @brief Add a question template
     *
     * @param agent_run_id Agent run ID
     * @param run_id Deterministic run ID
     * @param title Template title
     * @param question_nl Natural language question
     * @param template_json Query plan template JSON
     * @param example_sql Optional example SQL
     * @param related_objects JSON array of related object names (tables/views)
     * @param confidence Confidence score
     * @return template_id on success, -1 on error
     */
    int add_question_template(
        int agent_run_id,
        int run_id,
        const std::string& title,
        const std::string& question_nl,
        const std::string& template_json,
        const std::string& example_sql = "",
        const std::string& related_objects = "",
        double confidence = 0.6
    );

    /**
     * @brief Add an LLM note
     *
     * @param agent_run_id Agent run ID
     * @param run_id Deterministic run ID
     * @param scope Note scope (global/schema/object/domain)
     * @param object_id Optional object ID
     * @param domain_key Optional domain key
     * @param title Note title
     * @param body Note body
     * @param tags_json Optional tags JSON array
     * @return note_id on success, -1 on error
     */
    int add_llm_note(
        int agent_run_id,
        int run_id,
        const std::string& scope,
        int object_id = -1,
        const std::string& domain_key = "",
        const std::string& title = "",
        const std::string& body = "",
        const std::string& tags_json = ""
    );

    /**
     * @brief Full-text search over LLM artifacts
     *
     * @param run_id Run ID
     * @param query FTS query (empty to list all)
     * @param limit Max results
     * @param include_objects Include full object details for question templates
     * @return JSON array of matching LLM artifacts with example_sql and related_objects
     */
    std::string fts_search_llm(
        int run_id,
        const std::string& query,
        int limit = 25,
        bool include_objects = false
    );

    /**
     * @brief Log an LLM search query
     *
     * @param run_id Run ID
     * @param query Search query string
     * @param lmt Result limit
     * @return 0 on success, -1 on error
     */
    int log_llm_search(
        int run_id,
        const std::string& query,
        int lmt = 25
    );

    /**
     * @brief Log RAG FTS search operation
     *
     * Logs RAG full-text search operations to the rag_search_log table.
     * Similar to log_llm_search() but for RAG searches.
     *
     * @param query Search query string
     * @param k Number of results requested
     * @param filters JSON string of filters applied (empty if none)
     * @return 0 on success, -1 on error
     */
    int log_rag_search_fts(
        const std::string& query,
        int k,
        const std::string& filters = ""
    );

    /**
     * @brief Log MCP tool invocation via /mcp/query/ endpoint
     * @param tool_name Name of the tool that was called
     * @param schema Schema name (empty if not applicable)
     * @param run_id Run ID (0 or -1 if not applicable)
     * @param start_time Start monotonic time (microseconds)
     * @param execution_time Execution duration (microseconds)
     * @param error Error message (empty if success)
     * @return 0 on success, -1 on error
     */
    int log_query_tool_call(
        const std::string& tool_name,
        const std::string& schema,
        int run_id,
        unsigned long long start_time,
        unsigned long long execution_time,
        const std::string& error
    );

    /**
     * @brief Get database handle for direct access
     * @return SQLite3DB pointer (the stored handle; no ownership is transferred)
     */
    SQLite3DB* get_db() { return db; }

    /**
     * @brief Get the database file path
     * @return Database file path (returned by value, i.e. a copy)
     */
    std::string get_db_path() const { return db_path; }

    // ============================================================
    // MCP QUERY RULES
    // ============================================================

    /**
     * @brief Load MCP query rules from SQLite
     * @param resultset Result set holding the rule rows to load
     */
    void load_mcp_query_rules(SQLite3_result* resultset);

    /**
     * @brief Evaluate MCP query rules for a tool invocation
     * @return MCP_Query_Processor_Output object populated with actions from matching rules
     * Caller is responsible for destroying the returned object.
     */
    MCP_Query_Processor_Output* evaluate_mcp_query_rules(
        const std::string& tool_name,
        const std::string& username,
        const std::string& target_id,
        const std::string& schemaname,
        const nlohmann::json& arguments,
        const std::string& original_query
    );

    /**
     * @brief Get current MCP query rules as resultset
     */
    SQLite3_result* get_mcp_query_rules();

    /**
     * @brief Get stats for MCP query rules (hits per rule)
     */
    SQLite3_result* get_stats_mcp_query_rules();

    // ============================================================
    // MCP QUERY DIGEST
    // ============================================================

    /**
     * @brief Update MCP query digest statistics
     */
    void update_mcp_query_digest(
        const std::string& tool_name,
        int run_id,
        uint64_t digest,
        const std::string& digest_text,
        unsigned long long duration_us,
        time_t timestamp
    );

    /**
     * @brief Get MCP query digest statistics
     * @param reset If true, reset stats after retrieval
     */
    SQLite3_result* get_mcp_query_digest(bool reset = false);

    /**
     * @brief Compute MCP query digest hash using SpookyHash
     */
    static uint64_t compute_mcp_digest(
        const std::string& tool_name,
        const nlohmann::json& arguments
    );

    /**
     * @brief Fingerprint MCP query arguments (replace literals with ?)
     */
    static std::string fingerprint_mcp_args(const nlohmann::json& arguments);
};
#endif /* PROXYSQLGENAI */
#endif /* CLASS_DISCOVERY_SCHEMA_H */