You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
proxysql/lib/Static_Harvester.cpp

1425 lines
43 KiB

#ifdef PROXYSQLGENAI
#include "proxysql.h"
// ============================================================
// Static_Harvester Implementation
//
// Static metadata harvester for MySQL databases. This class performs
// deterministic metadata extraction from MySQL's INFORMATION_SCHEMA
// and stores it in a Discovery_Schema catalog for use by MCP tools.
//
// Harvest stages (executed in order by run_full_harvest):
// 1. Schemas/Databases - From information_schema.SCHEMATA
// 2. Objects - Tables, views, routines from TABLES and ROUTINES
// 3. Columns - From COLUMNS with derived hints (is_time, is_id_like)
// 4. Indexes - From STATISTICS with is_pk, is_unique, is_indexed flags
// 5. Foreign Keys - From KEY_COLUMN_USAGE and REFERENTIAL_CONSTRAINTS
// 6. View Definitions - From VIEWS
// 7. Quick Profiles - Metadata-based table kind inference (log/event, fact, entity)
// 8. FTS Index Rebuild - Full-text search index for object discovery
// ============================================================
#include "Static_Harvester.h"
#include "proxysql_debug.h"
#include <algorithm>
#include <cstdlib>
#include <cstring>
#include <map>
#include <regex>
#include <sstream>
#include <string>
#include <utility>
#include <vector>
// MySQL client library
#include <mysql.h>
// JSON library
#include "../deps/json/json.hpp"
using json = nlohmann::json;
// ============================================================
// Constructor / Destructor
// ============================================================
// Construct a harvester bound to one MySQL endpoint and one catalog file.
//
// Parameters:
//   host         - MySQL server hostname or IP address
//   port         - MySQL server port number
//   user         - MySQL username used for authentication
//   password     - MySQL password used for authentication
//   schema       - Default schema; stored and later used in the source DSN
//                  (may be empty)
//   catalog_path - Path handed to the Discovery_Schema constructor
//
// Notes:
//   - Only stores the connection parameters; no MySQL connection is opened
//     here (mysql_conn starts as NULL, connect_mysql() opens it later).
//   - Allocates the Discovery_Schema catalog object; init() must still be
//     called afterwards to initialize it.
//   - current_run_id starts at -1, meaning "no active run".
Static_Harvester::Static_Harvester(
	const std::string& host,
	int port,
	const std::string& user,
	const std::string& password,
	const std::string& schema,
	const std::string& catalog_path
) :
	mysql_host(host),
	mysql_port(port),
	mysql_user(user),
	mysql_password(password),
	mysql_schema(schema),
	mysql_conn(NULL),
	catalog(NULL),
	current_run_id(-1)
{
	// The connection mutex must exist before any connect/query call.
	pthread_mutex_init(&conn_lock, NULL);

	// The catalog object is owned by this harvester and released in the destructor.
	catalog = new Discovery_Schema(catalog_path);
}
// Tear down the harvester.
//
// Order matters: the MySQL connection is closed first (close() takes
// conn_lock), then the catalog is released, and only then is the
// connection mutex destroyed.
Static_Harvester::~Static_Harvester() {
	close();

	// Deleting a null pointer is a no-op, so no explicit check is needed.
	delete catalog;

	pthread_mutex_destroy(&conn_lock);
}
// ============================================================
// Lifecycle Methods
// ============================================================
// Prepare the harvester for use by initializing its catalog.
//
// Delegates to Discovery_Schema::init() on the catalog created in the
// constructor. Call this once after construction and before any harvest
// operation.
//
// Returns:
//   0 on success, -1 if the catalog reported a failure
int Static_Harvester::init() {
	// A zero (false) result from the catalog means success.
	if (!catalog->init()) {
		return 0;
	}

	proxy_error("Static_Harvester: Failed to initialize catalog\n");
	return -1;
}
// Close the MySQL connection and cleanup resources.
//
// Disconnects from MySQL if connected. The catalog is NOT destroyed,
// allowing multiple harvest runs with the same harvester instance.
void Static_Harvester::close() {
disconnect_mysql();
}
// ============================================================
// MySQL Connection Methods
// ============================================================
// Establish connection to the MySQL server.
//
// Connects to MySQL using the credentials provided during construction.
// If already connected, returns 0 immediately (idempotent).
//
// Connection settings:
// - 30 second connect/read/write timeouts
// - CLIENT_MULTI_STATEMENTS flag enabled
// - No default database selected (we query information_schema)
//
// On successful connection, also retrieves the MySQL server version
// and builds the source DSN string for run tracking.
//
// Thread Safety:
// Uses mutex to ensure thread-safe connection establishment.
//
// Returns:
// 0 on success (including already connected), -1 on error
int Static_Harvester::connect_mysql() {
pthread_mutex_lock(&conn_lock);
if (mysql_conn) {
pthread_mutex_unlock(&conn_lock);
return 0; // Already connected
}
mysql_conn = mysql_init(NULL);
if (!mysql_conn) {
proxy_error("Static_Harvester: mysql_init failed\n");
pthread_mutex_unlock(&conn_lock);
return -1;
}
// Set timeouts
unsigned int timeout = 30;
mysql_options(mysql_conn, MYSQL_OPT_CONNECT_TIMEOUT, &timeout);
mysql_options(mysql_conn, MYSQL_OPT_READ_TIMEOUT, &timeout);
mysql_options(mysql_conn, MYSQL_OPT_WRITE_TIMEOUT, &timeout);
// Connect
if (!mysql_real_connect(
mysql_conn,
mysql_host.c_str(),
mysql_user.c_str(),
mysql_password.c_str(),
NULL, // No default schema - we query information_schema
mysql_port,
NULL,
CLIENT_MULTI_STATEMENTS
)) {
proxy_error("Static_Harvester: mysql_real_connect failed: %s\n", mysql_error(mysql_conn));
mysql_close(mysql_conn);
mysql_conn = NULL;
pthread_mutex_unlock(&conn_lock);
return -1;
}
// Get MySQL version
mysql_version = get_mysql_version();
source_dsn = "mysql://" + mysql_user + "@" + mysql_host + ":" + std::to_string(mysql_port) + "/" + mysql_schema;
proxy_info("Static_Harvester: Connected to MySQL %s at %s:%d\n",
mysql_version.c_str(), mysql_host.c_str(), mysql_port);
pthread_mutex_unlock(&conn_lock);
return 0;
}
// Disconnect from the MySQL server.
//
// Closes the MySQL connection if connected. Safe to call when
// not connected (idempotent).
//
// Thread Safety:
// Uses mutex to ensure thread-safe disconnection.
void Static_Harvester::disconnect_mysql() {
pthread_mutex_lock(&conn_lock);
if (mysql_conn) {
mysql_close(mysql_conn);
mysql_conn = NULL;
}
pthread_mutex_unlock(&conn_lock);
}
// Get the MySQL server version string.
//
// Reads the version reported by the server for the current connection via
// mysql_get_server_info(). This is answered from the connection handle, so
// no query is sent to the server.
//
// Thread Safety:
//   Does not take conn_lock itself; connect_mysql() calls it while already
//   holding the lock.
//
// Returns:
//   MySQL version string (e.g., "8.0.35"), or an empty string if there is
//   no connection or the client library reports no version
std::string Static_Harvester::get_mysql_version() {
	if (!mysql_conn) {
		return "";
	}
	// The previous implementation issued mysql_list_tables() first; with no
	// default database selected that call only produced a failed round-trip
	// and both of its branches returned the same value, so it was removed.
	const char* server_info = mysql_get_server_info(mysql_conn);
	// Constructing std::string from a null pointer is undefined behavior.
	return server_info ? std::string(server_info) : std::string();
}
// Execute a SQL query on the MySQL server and return results.
//
// Executes the query and returns all result rows as a vector of string vectors.
// NULL values are converted to empty strings.
//
// Parameters:
// query - SQL query string to execute
// results - Output parameter populated with result rows
//
// Returns:
// 0 on success (including queries with no result set), -1 on error
//
// Thread Safety:
// Uses mutex to ensure thread-safe query execution.
int Static_Harvester::execute_query(const std::string& query, std::vector<std::vector<std::string>>& results) {
pthread_mutex_lock(&conn_lock);
if (!mysql_conn) {
pthread_mutex_unlock(&conn_lock);
proxy_error("Static_Harvester: Not connected to MySQL\n");
return -1;
}
proxy_debug(PROXY_DEBUG_GENERIC, 3, "Static_Harvester: Executing query: %s\n", query.c_str());
if (mysql_query(mysql_conn, query.c_str())) {
proxy_error("Static_Harvester: Query failed: %s\n", mysql_error(mysql_conn));
pthread_mutex_unlock(&conn_lock);
return -1;
}
MYSQL_RES* res = mysql_store_result(mysql_conn);
if (!res) {
// No result set (e.g., INSERT/UPDATE)
pthread_mutex_unlock(&conn_lock);
return 0;
}
int num_fields = mysql_num_fields(res);
MYSQL_ROW row;
while ((row = mysql_fetch_row(res))) {
std::vector<std::string> row_data;
for (int i = 0; i < num_fields; i++) {
row_data.push_back(row[i] ? row[i] : "");
}
results.push_back(row_data);
}
mysql_free_result(res);
pthread_mutex_unlock(&conn_lock);
return 0;
}
// ============================================================
// Helper Methods
// ============================================================
// Check if a data type is a temporal/time type.
//
// Used to mark columns with is_time=1 for time-based analysis.
// The comparison is case-insensitive.
//
// Parameters:
//   data_type - MySQL data type string (e.g., "DATETIME", "VARCHAR")
//
// Returns:
//   true if the type is date, datetime, timestamp, time, or year; false otherwise
bool Static_Harvester::is_time_type(const std::string& data_type) {
	std::string dt = data_type;
	// tolower() has undefined behavior for negative char values, so each
	// byte is routed through unsigned char first.
	std::transform(dt.begin(), dt.end(), dt.begin(),
		[](unsigned char ch) { return static_cast<char>(::tolower(ch)); });
	return dt == "date" || dt == "datetime" || dt == "timestamp" ||
		dt == "time" || dt == "year";
}
// Check if a column name appears to be an identifier/ID column.
//
// Used to mark columns with is_id_like=1 for relationship inference.
// The check is case-insensitive: a name is ID-like when it is exactly
// "id" or when it ends with "_id" (this includes the bare name "_id").
//
// Parameters:
//   column_name - Column name to check
//
// Returns:
//   true if the column name ends with "_id" or is exactly "id"; false otherwise
bool Static_Harvester::is_id_like_name(const std::string& column_name) {
	std::string cn = column_name;
	// Column names may contain non-ASCII bytes; tolower() is undefined for
	// negative char values, so each byte goes through unsigned char.
	std::transform(cn.begin(), cn.end(), cn.begin(),
		[](unsigned char ch) { return static_cast<char>(::tolower(ch)); });

	if (cn == "id") return true;

	// ">= 3" (not "> 3") so that a column named exactly "_id" also matches
	// the documented "ends with _id" rule.
	static const char kSuffix[] = "_id";
	const std::string::size_type suffix_len = sizeof(kSuffix) - 1;
	return cn.length() >= suffix_len &&
		cn.compare(cn.length() - suffix_len, suffix_len, kSuffix) == 0;
}
// Validate a schema/database name for safe use in SQL queries.
//
// Only ASCII letters, ASCII digits, underscore and dollar sign are
// accepted. The schema name is later concatenated into a quoted SQL
// literal for INFORMATION_SCHEMA queries, so anything that could end the
// literal (quotes, backslashes, ...) must be rejected here.
//
// The character classes are checked with explicit ASCII ranges rather than
// isalnum(): isalnum() is locale-dependent and has undefined behavior when
// given a negative char value (any byte >= 0x80 on signed-char platforms).
//
// Parameters:
//   name - Schema name to validate
//
// Returns:
//   true if the name is safe to use (an empty name is accepted and means
//   "all schemas"), false otherwise
bool Static_Harvester::is_valid_schema_name(const std::string& name) {
	if (name.empty()) {
		return true; // Empty filter is valid (means "all schemas")
	}
	for (char c : name) {
		const bool is_digit = (c >= '0' && c <= '9');
		const bool is_upper = (c >= 'A' && c <= 'Z');
		const bool is_lower = (c >= 'a' && c <= 'z');
		if (!(is_digit || is_upper || is_lower || c == '_' || c == '$')) {
			return false;
		}
	}
	return true;
}
// Produce a copy of `str` that can be placed inside a single-quoted SQL
// string literal: every single quote is replaced by two single quotes.
// No other character is altered.
//
// Parameters:
//   str - Text to escape
//
// Returns:
//   The escaped text
std::string Static_Harvester::escape_sql_string(const std::string& str) {
	std::string out;
	out.reserve(str.length() * 2); // worst case: every character is a quote

	// Copy the text in runs delimited by quote characters.
	std::string::size_type run_start = 0;
	for (;;) {
		const std::string::size_type quote_at = str.find('\'', run_start);
		if (quote_at == std::string::npos) {
			out.append(str, run_start, std::string::npos);
			break;
		}
		out.append(str, run_start, quote_at - run_start);
		out += "''";
		run_start = quote_at + 1;
	}
	return out;
}
// ============================================================
// Discovery Run Management
// ============================================================
// Begin a new discovery run.
//
// Registers a run in the catalog and remembers its id in current_run_id;
// the harvest_* methods attach their output to that id.
//
// Parameters:
//   target_id - Target identifier, passed through unchanged to
//               Discovery_Schema::create_run()
//   notes     - Optional free-text notes stored with the run
//
// Returns:
//   The new run_id on success; -1 if a run is already active, if the MySQL
//   connection cannot be established, or if the catalog fails to create
//   the run
//
// Notes:
//   - At most one run may be active per harvester instance.
//   - Connects to MySQL first (connect_mysql() is a no-op when connected).
//   - The run is recorded with source type "mysql", the source DSN and the
//     server version captured at connect time.
int Static_Harvester::start_run(const std::string& target_id, const std::string& notes) {
	const bool run_in_progress = (current_run_id >= 0);
	if (run_in_progress) {
		proxy_error("Static_Harvester: Run already active (run_id=%d)\n", current_run_id);
		return -1;
	}

	if (connect_mysql() != 0) {
		return -1;
	}

	current_run_id = catalog->create_run(target_id, "mysql", source_dsn, mysql_version, notes);
	if (current_run_id >= 0) {
		proxy_info("Static_Harvester: Started run_id=%d\n", current_run_id);
		return current_run_id;
	}

	proxy_error("Static_Harvester: Failed to create run\n");
	return -1;
}
// Close the active discovery run.
//
// Asks the catalog to finish the run identified by current_run_id and, on
// success, resets current_run_id to -1 so a new run can be started. If the
// catalog call fails the run stays active.
//
// Parameters:
//   notes - Optional completion notes (e.g., "Completed successfully",
//           "Failed at stage X")
//
// Returns:
//   0 on success, -1 if no run is active or the catalog reports a failure
int Static_Harvester::finish_run(const std::string& notes) {
	if (current_run_id < 0) {
		proxy_error("Static_Harvester: No active run\n");
		return -1;
	}

	const int finish_status = catalog->finish_run(current_run_id, notes);
	if (finish_status == 0) {
		proxy_info("Static_Harvester: Finished run_id=%d\n", current_run_id);
		current_run_id = -1;
		return 0;
	}

	proxy_error("Static_Harvester: Failed to finish run\n");
	return -1;
}
// ============================================================
// Fetch Methods (Query INFORMATION_SCHEMA)
// ============================================================
// Fetch schema/database metadata from information_schema.SCHEMATA.
//
// Queries MySQL for all schemas (databases) and their character set
// and collation information.
//
// Parameters:
// filter - Optional schema name filter (empty for all schemas)
//
// Returns:
// Vector of SchemaRow structures containing schema metadata
std::vector<Static_Harvester::SchemaRow> Static_Harvester::fetch_schemas(const std::string& filter) {
std::vector<SchemaRow> schemas;
// Validate schema name to prevent SQL injection
if (!is_valid_schema_name(filter)) {
proxy_error("Static_Harvester: Invalid schema name '%s'\n", filter.c_str());
return schemas;
}
std::ostringstream sql;
sql << "SELECT SCHEMA_NAME, DEFAULT_CHARACTER_SET_NAME, DEFAULT_COLLATION_NAME "
<< "FROM information_schema.SCHEMATA";
if (!filter.empty()) {
sql << " WHERE SCHEMA_NAME = '" << filter << "'";
}
sql << " ORDER BY SCHEMA_NAME;";
std::vector<std::vector<std::string>> results;
if (execute_query(sql.str(), results) == 0) {
for (const auto& row : results) {
SchemaRow s;
s.schema_name = row[0];
s.charset = row[1];
s.collation = row[2];
schemas.push_back(s);
}
}
return schemas;
}
// ============================================================
// Harvest Stage Methods
// ============================================================
// Harvest schemas/databases to the catalog.
//
// Fetches schemas from information_schema.SCHEMATA and inserts them
// into the catalog. System schemas (mysql, information_schema,
// performance_schema, sys) are skipped.
//
// Parameters:
// only_schema - Optional filter to harvest only one schema
//
// Returns:
// Number of schemas harvested, or -1 on error
//
// Notes:
// - Requires an active run (start_run must be called first)
// - Skips system schemas automatically
int Static_Harvester::harvest_schemas(const std::string& only_schema) {
if (current_run_id < 0) {
proxy_error("Static_Harvester: No active run\n");
return -1;
}
std::vector<SchemaRow> schemas = fetch_schemas(only_schema);
int count = 0;
for (const auto& s : schemas) {
// Skip system schemas
if (s.schema_name == "mysql" || s.schema_name == "information_schema" ||
s.schema_name == "performance_schema" || s.schema_name == "sys") {
continue;
}
if (catalog->insert_schema(current_run_id, s.schema_name, s.charset, s.collation) >= 0) {
count++;
}
}
proxy_info("Static_Harvester: Harvested %d schemas\n", count);
return count;
}
// Fetch table and view metadata from information_schema.TABLES.
//
// Queries MySQL for all tables and views with their physical
// characteristics (rows, size, engine, timestamps).
//
// Parameters:
// filter - Optional schema name filter
//
// Returns:
// Vector of ObjectRow structures containing table/view metadata
std::vector<Static_Harvester::ObjectRow> Static_Harvester::fetch_tables_views(const std::string& filter) {
std::vector<ObjectRow> objects;
// Validate schema name to prevent SQL injection
if (!is_valid_schema_name(filter)) {
proxy_error("Static_Harvester: Invalid schema name '%s'\n", filter.c_str());
return objects;
}
std::ostringstream sql;
sql << "SELECT TABLE_SCHEMA, TABLE_NAME, TABLE_TYPE, ENGINE, TABLE_ROWS, "
<< "DATA_LENGTH, INDEX_LENGTH, CREATE_TIME, UPDATE_TIME, TABLE_COMMENT "
<< "FROM information_schema.TABLES "
<< "WHERE TABLE_SCHEMA NOT IN ('mysql','information_schema','performance_schema','sys')";
if (!filter.empty()) {
sql << " AND TABLE_SCHEMA = '" << filter << "'";
}
sql << " ORDER BY TABLE_SCHEMA, TABLE_NAME;";
std::vector<std::vector<std::string>> results;
if (execute_query(sql.str(), results) == 0) {
for (const auto& row : results) {
ObjectRow o;
o.schema_name = row[0];
o.object_name = row[1];
o.object_type = (row[2] == "VIEW") ? "view" : "table";
o.engine = row[3];
o.table_rows_est = row[4].empty() ? 0 : atol(row[4].c_str());
o.data_length = row[5].empty() ? 0 : atol(row[5].c_str());
o.index_length = row[6].empty() ? 0 : atol(row[6].c_str());
o.create_time = row[7];
o.update_time = row[8];
o.object_comment = row[9];
objects.push_back(o);
}
}
return objects;
}
// Fetch column metadata from information_schema.COLUMNS.
//
// Queries MySQL for all columns with their data types, nullability,
// defaults, character set, and comments.
//
// Parameters:
// filter - Optional schema name filter
//
// Returns:
// Vector of ColumnRow structures containing column metadata
std::vector<Static_Harvester::ColumnRow> Static_Harvester::fetch_columns(const std::string& filter) {
std::vector<ColumnRow> columns;
// Validate schema name to prevent SQL injection
if (!is_valid_schema_name(filter)) {
proxy_error("Static_Harvester: Invalid schema name '%s'\n", filter.c_str());
return columns;
}
std::ostringstream sql;
sql << "SELECT TABLE_SCHEMA, TABLE_NAME, ORDINAL_POSITION, COLUMN_NAME, "
<< "DATA_TYPE, COLUMN_TYPE, IS_NULLABLE, COLUMN_DEFAULT, EXTRA, "
<< "CHARACTER_SET_NAME, COLLATION_NAME, COLUMN_COMMENT "
<< "FROM information_schema.COLUMNS "
<< "WHERE TABLE_SCHEMA NOT IN ('mysql','information_schema','performance_schema','sys')";
if (!filter.empty()) {
sql << " AND TABLE_SCHEMA = '" << filter << "'";
}
sql << " ORDER BY TABLE_SCHEMA, TABLE_NAME, ORDINAL_POSITION;";
std::vector<std::vector<std::string>> results;
if (execute_query(sql.str(), results) == 0) {
for (const auto& row : results) {
ColumnRow c;
c.schema_name = row[0];
c.object_name = row[1];
c.ordinal_pos = atoi(row[2].c_str());
c.column_name = row[3];
c.data_type = row[4];
c.column_type = row[5];
c.is_nullable = (row[6] == "YES") ? 1 : 0;
c.column_default = row[7];
c.extra = row[8];
c.charset = row[9];
c.collation = row[10];
c.column_comment = row[11];
columns.push_back(c);
}
}
return columns;
}
// Fetch index metadata from information_schema.STATISTICS.
//
// Queries MySQL for all indexes with their columns, sequence,
// uniqueness, cardinality, and collation.
//
// Parameters:
// filter - Optional schema name filter
//
// Returns:
// Vector of IndexRow structures containing index metadata
std::vector<Static_Harvester::IndexRow> Static_Harvester::fetch_indexes(const std::string& filter) {
std::vector<IndexRow> indexes;
// Validate schema name to prevent SQL injection
if (!is_valid_schema_name(filter)) {
proxy_error("Static_Harvester: Invalid schema name '%s'\n", filter.c_str());
return indexes;
}
std::ostringstream sql;
sql << "SELECT TABLE_SCHEMA, TABLE_NAME, INDEX_NAME, NON_UNIQUE, INDEX_TYPE, "
<< "SEQ_IN_INDEX, COLUMN_NAME, SUB_PART, COLLATION, CARDINALITY "
<< "FROM information_schema.STATISTICS "
<< "WHERE TABLE_SCHEMA NOT IN ('mysql','information_schema','performance_schema','sys')";
if (!filter.empty()) {
sql << " AND TABLE_SCHEMA = '" << filter << "'";
}
sql << " ORDER BY TABLE_SCHEMA, TABLE_NAME, INDEX_NAME, SEQ_IN_INDEX;";
std::vector<std::vector<std::string>> results;
if (execute_query(sql.str(), results) == 0) {
for (const auto& row : results) {
IndexRow i;
i.schema_name = row[0];
i.object_name = row[1];
i.index_name = row[2];
i.is_unique = (row[3] == "0") ? 1 : 0;
i.index_type = row[4];
i.seq_in_index = atoi(row[5].c_str());
i.column_name = row[6];
i.sub_part = row[7].empty() ? 0 : atoi(row[7].c_str());
i.collation = row[8];
i.cardinality = row[9].empty() ? 0 : atol(row[9].c_str());
indexes.push_back(i);
}
}
return indexes;
}
// Fetch foreign key metadata from information_schema.
//
// Queries KEY_COLUMN_USAGE and REFERENTIAL_CONSTRAINTS to get
// foreign key relationships including child/parent tables and columns,
// and ON UPDATE/DELETE rules.
//
// Parameters:
// filter - Optional schema name filter
//
// Returns:
// Vector of FKRow structures containing foreign key metadata
std::vector<Static_Harvester::FKRow> Static_Harvester::fetch_foreign_keys(const std::string& filter) {
std::vector<FKRow> fks;
// Validate schema name to prevent SQL injection
if (!is_valid_schema_name(filter)) {
proxy_error("Static_Harvester: Invalid schema name '%s'\n", filter.c_str());
return fks;
}
std::ostringstream sql;
sql << "SELECT kcu.CONSTRAINT_SCHEMA AS child_schema, "
<< "kcu.TABLE_NAME AS child_table, kcu.CONSTRAINT_NAME AS fk_name, "
<< "kcu.COLUMN_NAME AS child_column, kcu.REFERENCED_TABLE_SCHEMA AS parent_schema, "
<< "kcu.REFERENCED_TABLE_NAME AS parent_table, kcu.REFERENCED_COLUMN_NAME AS parent_column, "
<< "kcu.ORDINAL_POSITION AS seq, rc.UPDATE_RULE AS on_update, rc.DELETE_RULE AS on_delete "
<< "FROM information_schema.KEY_COLUMN_USAGE kcu "
<< "JOIN information_schema.REFERENTIAL_CONSTRAINTS rc "
<< " ON rc.CONSTRAINT_SCHEMA = kcu.CONSTRAINT_SCHEMA "
<< " AND rc.CONSTRAINT_NAME = kcu.CONSTRAINT_NAME "
<< "WHERE kcu.TABLE_SCHEMA NOT IN ('mysql','information_schema','performance_schema','sys')";
if (!filter.empty()) {
sql << " AND kcu.TABLE_SCHEMA = '" << filter << "'";
}
sql << " AND kcu.REFERENCED_TABLE_NAME IS NOT NULL "
<< "ORDER BY child_schema, child_table, fk_name, seq;";
std::vector<std::vector<std::string>> results;
if (execute_query(sql.str(), results) == 0) {
for (const auto& row : results) {
FKRow fk;
fk.child_schema = row[0];
fk.child_table = row[1];
fk.fk_name = row[2];
fk.child_column = row[3];
fk.parent_schema = row[4];
fk.parent_table = row[5];
fk.parent_column = row[6];
fk.seq = atoi(row[7].c_str());
fk.on_update = row[8];
fk.on_delete = row[9];
fks.push_back(fk);
}
}
return fks;
}
// Harvest objects (tables, views, routines) to the catalog.
//
// Fetches tables/views from information_schema.TABLES and routines
// from information_schema.ROUTINES, inserting them all into the catalog.
//
// Parameters:
// only_schema - Optional filter to harvest only one schema
//
// Returns:
// Number of objects harvested, or -1 on error
int Static_Harvester::harvest_objects(const std::string& only_schema) {
if (current_run_id < 0) {
proxy_error("Static_Harvester: No active run\n");
return -1;
}
// Fetch tables and views
std::vector<ObjectRow> objects = fetch_tables_views(only_schema);
int count = 0;
for (const auto& o : objects) {
int object_id = catalog->insert_object(
current_run_id, o.schema_name, o.object_name, o.object_type,
o.engine, o.table_rows_est, o.data_length, o.index_length,
o.create_time, o.update_time, o.object_comment, ""
);
if (object_id >= 0) {
count++;
}
}
// Fetch and insert routines (stored procedures/functions)
std::ostringstream sql;
sql << "SELECT ROUTINE_SCHEMA, ROUTINE_NAME, ROUTINE_TYPE, ROUTINE_COMMENT "
<< "FROM information_schema.ROUTINES "
<< "WHERE ROUTINE_SCHEMA NOT IN ('mysql','information_schema','performance_schema','sys')";
if (!only_schema.empty()) {
sql << " AND ROUTINE_SCHEMA = '" << only_schema << "'";
}
sql << " ORDER BY ROUTINE_SCHEMA, ROUTINE_NAME;";
std::vector<std::vector<std::string>> results;
if (execute_query(sql.str(), results) == 0) {
for (const auto& row : results) {
int object_id = catalog->insert_object(
current_run_id, row[0], row[1], "routine",
"", 0, 0, 0, "", "", row[3], ""
);
if (object_id >= 0) {
count++;
}
}
}
proxy_info("Static_Harvester: Harvested %d objects\n", count);
return count;
}
// Harvest columns to the catalog with derived hints.
//
// Fetches columns from information_schema.COLUMNS and computes
// derived flags: is_time (temporal types) and is_id_like (ID-like names).
// Updates object flags after all columns are inserted.
//
// Parameters:
// only_schema - Optional filter to harvest only one schema
//
// Returns:
// Number of columns harvested, or -1 on error
//
// Notes:
// - Updates object flags (has_time_column) after harvest
int Static_Harvester::harvest_columns(const std::string& only_schema) {
if (current_run_id < 0) {
proxy_error("Static_Harvester: No active run\n");
return -1;
}
std::vector<ColumnRow> columns = fetch_columns(only_schema);
int count = 0;
for (const auto& c : columns) {
// Find the object_id for this column
std::string object_key = c.schema_name + "." + c.object_name;
// Query catalog to get object_id
char* error = NULL;
int cols = 0, affected = 0;
SQLite3_result* resultset = NULL;
std::ostringstream sql;
sql << "SELECT object_id FROM objects "
<< "WHERE run_id = " << current_run_id
<< " AND schema_name = '" << c.schema_name << "'"
<< " AND object_name = '" << c.object_name << "'"
<< " AND object_type IN ('table', 'view') LIMIT 1;";
catalog->get_db()->execute_statement(sql.str().c_str(), &error, &cols, &affected, &resultset);
if (!resultset || resultset->rows.empty()) {
delete resultset;
continue; // Object not found
}
int object_id = atoi(resultset->rows[0]->fields[0]);
delete resultset;
// Compute derived flags
int is_time = is_time_type(c.data_type) ? 1 : 0;
int is_id_like = is_id_like_name(c.column_name) ? 1 : 0;
if (catalog->insert_column(
object_id, c.ordinal_pos, c.column_name, c.data_type,
c.column_type, c.is_nullable, c.column_default, c.extra,
c.charset, c.collation, c.column_comment,
0, 0, 0, is_time, is_id_like
) >= 0) {
count++;
}
}
// Update object flags
catalog->update_object_flags(current_run_id);
proxy_info("Static_Harvester: Harvested %d columns\n", count);
return count;
}
// Harvest indexes to the catalog and update column flags.
//
// Fetches indexes from information_schema.STATISTICS and inserts
// them with their columns. Updates column flags (is_pk, is_unique,
// is_indexed) and object flags (has_primary_key) after harvest.
//
// Parameters:
// only_schema - Optional filter to harvest only one schema
//
// Returns:
// Number of indexes harvested, or -1 on error
//
// Notes:
// - Groups index columns by index name
// - Marks PRIMARY KEY indexes with is_primary=1
// - Updates column and object flags after harvest
int Static_Harvester::harvest_indexes(const std::string& only_schema) {
if (current_run_id < 0) {
proxy_error("Static_Harvester: No active run\n");
return -1;
}
std::vector<IndexRow> indexes = fetch_indexes(only_schema);
// Group by index
std::map<std::string, std::vector<IndexRow>> index_map;
for (const auto& i : indexes) {
std::string key = i.schema_name + "." + i.object_name + "." + i.index_name;
index_map[key].push_back(i);
}
int count = 0;
for (const auto& entry : index_map) {
const auto& idx_rows = entry.second;
if (idx_rows.empty()) continue;
const IndexRow& first = idx_rows[0];
// Get object_id
char* error = NULL;
int cols = 0, affected = 0;
SQLite3_result* resultset = NULL;
std::ostringstream sql;
sql << "SELECT object_id FROM objects "
<< "WHERE run_id = " << current_run_id
<< " AND schema_name = '" << first.schema_name << "'"
<< " AND object_name = '" << first.object_name << "'"
<< " AND object_type = 'table' LIMIT 1;";
catalog->get_db()->execute_statement(sql.str().c_str(), &error, &cols, &affected, &resultset);
if (!resultset || resultset->rows.empty()) {
delete resultset;
continue;
}
int object_id = atoi(resultset->rows[0]->fields[0]);
delete resultset;
// Check if this is the primary key
int is_primary = (first.index_name == "PRIMARY") ? 1 : 0;
// Insert index
int index_id = catalog->insert_index(
object_id, first.index_name, first.is_unique, is_primary,
first.index_type, first.cardinality
);
if (index_id < 0) continue;
// Insert index columns
for (const auto& idx_row : idx_rows) {
catalog->insert_index_column(
index_id, idx_row.seq_in_index, idx_row.column_name,
idx_row.sub_part, idx_row.collation
);
}
count++;
}
// Update column is_pk, is_unique, is_indexed flags
char* error = NULL;
int cols, affected;
std::ostringstream sql;
// Mark indexed columns
sql << "UPDATE columns SET is_indexed = 1 "
<< "WHERE object_id IN (SELECT object_id FROM objects WHERE run_id = " << current_run_id << ") "
<< "AND (object_id, column_name) IN ("
<< " SELECT i.object_id, ic.column_name FROM indexes i JOIN index_columns ic ON i.index_id = ic.index_id"
<< ");";
catalog->get_db()->execute_statement(sql.str().c_str(), &error, &cols, &affected);
// Mark PK columns
sql.str("");
sql << "UPDATE columns SET is_pk = 1 "
<< "WHERE object_id IN (SELECT object_id FROM objects WHERE run_id = " << current_run_id << ") "
<< "AND (object_id, column_name) IN ("
<< " SELECT i.object_id, ic.column_name FROM indexes i JOIN index_columns ic ON i.index_id = ic.index_id "
<< " WHERE i.is_primary = 1"
<< ");";
catalog->get_db()->execute_statement(sql.str().c_str(), &error, &cols, &affected);
// Mark unique columns (simplified - for single-column unique indexes)
sql.str("");
sql << "UPDATE columns SET is_unique = 1 "
<< "WHERE object_id IN (SELECT object_id FROM objects WHERE run_id = " << current_run_id << ") "
<< "AND (object_id, column_name) IN ("
<< " SELECT i.object_id, ic.column_name FROM indexes i JOIN index_columns ic ON i.index_id = ic.index_id "
<< " WHERE i.is_unique = 1 AND i.is_primary = 0 "
<< " GROUP BY i.object_id, ic.column_name HAVING COUNT(*) = 1"
<< ");";
catalog->get_db()->execute_statement(sql.str().c_str(), &error, &cols, &affected);
// Update object has_primary_key flag
catalog->update_object_flags(current_run_id);
proxy_info("Static_Harvester: Harvested %d indexes\n", count);
return count;
}
// Harvest foreign keys to the catalog.
//
// Fetches foreign keys from information_schema and inserts them
// with their child/parent column mappings. Updates object flags
// (has_foreign_keys) after harvest.
//
// Parameters:
// only_schema - Optional filter to harvest only one schema
//
// Returns:
// Number of foreign keys harvested, or -1 on error
//
// Notes:
// - Groups FK columns by constraint name
// - Updates object flags after harvest
int Static_Harvester::harvest_foreign_keys(const std::string& only_schema) {
if (current_run_id < 0) {
proxy_error("Static_Harvester: No active run\n");
return -1;
}
std::vector<FKRow> fks = fetch_foreign_keys(only_schema);
// Group by FK
std::map<std::string, std::vector<FKRow>> fk_map;
for (const auto& fk : fks) {
std::string key = fk.child_schema + "." + fk.child_table + "." + fk.fk_name;
fk_map[key].push_back(fk);
}
int count = 0;
for (const auto& entry : fk_map) {
const auto& fk_rows = entry.second;
if (fk_rows.empty()) continue;
const FKRow& first = fk_rows[0];
// Get child object_id
char* error = NULL;
int cols = 0, affected = 0;
SQLite3_result* resultset = NULL;
std::ostringstream sql;
sql << "SELECT object_id FROM objects "
<< "WHERE run_id = " << current_run_id
<< " AND schema_name = '" << first.child_schema << "'"
<< " AND object_name = '" << first.child_table << "'"
<< " AND object_type = 'table' LIMIT 1;";
catalog->get_db()->execute_statement(sql.str().c_str(), &error, &cols, &affected, &resultset);
if (!resultset || resultset->rows.empty()) {
delete resultset;
continue;
}
int child_object_id = atoi(resultset->rows[0]->fields[0]);
delete resultset;
// Insert FK
int fk_id = catalog->insert_foreign_key(
current_run_id, child_object_id, first.fk_name,
first.parent_schema, first.parent_table,
first.on_update, first.on_delete
);
if (fk_id < 0) continue;
// Insert FK columns
for (const auto& fk_row : fk_rows) {
catalog->insert_foreign_key_column(
fk_id, fk_row.seq, fk_row.child_column, fk_row.parent_column
);
}
count++;
}
// Update object has_foreign_keys flag
catalog->update_object_flags(current_run_id);
proxy_info("Static_Harvester: Harvested %d foreign keys\n", count);
return count;
}
// Harvest view definitions to the catalog.
//
// Reads TABLE_SCHEMA, TABLE_NAME and VIEW_DEFINITION from
// information_schema.VIEWS (excluding the mysql, information_schema,
// performance_schema and sys schemas) and writes each definition into the
// definition_sql column of the matching 'view' row in the objects table for
// the current run.
//
// Parameters:
//   only_schema - Optional filter to harvest only one schema
//
// Returns:
//   Number of views whose catalog row was updated, or -1 if there is no
//   active run or the source query fails
int Static_Harvester::harvest_view_definitions(const std::string& only_schema) {
	if (current_run_id < 0) {
		proxy_error("Static_Harvester: No active run\n");
		return -1;
	}

	std::ostringstream sql;
	sql << "SELECT TABLE_SCHEMA, TABLE_NAME, VIEW_DEFINITION "
	    << "FROM information_schema.VIEWS "
	    << "WHERE TABLE_SCHEMA NOT IN ('mysql','information_schema','performance_schema','sys')";
	if (!only_schema.empty()) {
		// NOTE(review): escape_sql_string() is assumed to double single
		// quotes; it does not neutralise backslashes, which MySQL treats as
		// an escape character unless NO_BACKSLASH_ESCAPES is set.
		sql << " AND TABLE_SCHEMA = '" << escape_sql_string(only_schema) << "'";
	}
	sql << ";";

	std::vector<std::vector<std::string>> results;
	if (execute_query(sql.str(), results) != 0) {
		return -1;
	}

	int count = 0;
	for (const auto& row : results) {
		// The query selects three columns; skip anything shorter rather than
		// indexing out of bounds.
		if (row.size() < 3) {
			continue;
		}
		const std::string& schema_name = row[0];
		const std::string& view_name = row[1];
		const std::string& view_def = row[2];

		// Store the definition on the matching view object of this run.
		char* error = NULL;
		int cols = 0;
		int affected = 0;

		std::ostringstream update_sql;
		update_sql << "UPDATE objects SET definition_sql = '" << escape_sql_string(view_def) << "' "
		           << "WHERE run_id = " << current_run_id
		           << " AND schema_name = '" << escape_sql_string(schema_name) << "'"
		           << " AND object_name = '" << escape_sql_string(view_name) << "'"
		           << " AND object_type = 'view';";
		catalog->get_db()->execute_statement(update_sql.str().c_str(), &error, &cols, &affected);

		if (error) {
			// NOTE(review): assumes execute_statement() hands back a
			// malloc-allocated message owned by the caller - confirm.
			proxy_error("Static_Harvester: Failed to store definition of view %s.%s: %s\n",
			            schema_name.c_str(), view_name.c_str(), error);
			free(error);
			continue;
		}
		if (affected > 0) {
			++count;
		}
	}

	proxy_info("Static_Harvester: Updated %d view definitions\n", count);
	return count;
}
// Build quick profiles (metadata-only table analysis).
//
// Analyzes table metadata to derive:
// - guessed_kind: log/event, fact, entity, or unknown (based on table name)
// - rows_est, size_bytes, engine: from object metadata
// - has_primary_key, has_foreign_keys, has_time_column: boolean flags
//
// Stores the profile as JSON with profile_kind='table_quick'.
//
// Returns:
// Number of profiles built, or -1 on error
//
// Table Kind Heuristics:
// - log/event: name contains "log", "event", or "audit"
// - fact: name contains "order", "invoice", "payment", or "transaction"
// - entity: name contains "user", "customer", "account", or "product"
// - unknown: none of the above patterns match
int Static_Harvester::build_quick_profiles() {
if (current_run_id < 0) {
proxy_error("Static_Harvester: No active run\n");
return -1;
}
char* error = NULL;
int cols = 0, affected = 0;
SQLite3_result* resultset = NULL;
std::ostringstream sql;
sql << "SELECT object_id, schema_name, object_name, object_type, engine, table_rows_est, "
<< "data_length, index_length, has_primary_key, has_foreign_keys, has_time_column "
<< "FROM objects WHERE run_id = " << current_run_id
<< " AND object_type IN ('table', 'view')";
catalog->get_db()->execute_statement(sql.str().c_str(), &error, &cols, &affected, &resultset);
if (!resultset) {
return -1;
}
int count = 0;
for (std::vector<SQLite3_row*>::iterator it = resultset->rows.begin();
it != resultset->rows.end(); ++it) {
SQLite3_row* row = *it;
int object_id = atoi(row->fields[0]);
std::string object_name = std::string(row->fields[2] ? row->fields[2] : "");
// Guess kind from name
std::string guessed_kind = "unknown";
std::string name_lower = object_name;
std::transform(name_lower.begin(), name_lower.end(), name_lower.begin(), ::tolower);
if (name_lower.find("log") != std::string::npos ||
name_lower.find("event") != std::string::npos ||
name_lower.find("audit") != std::string::npos) {
guessed_kind = "log/event";
} else if (name_lower.find("order") != std::string::npos ||
name_lower.find("invoice") != std::string::npos ||
name_lower.find("payment") != std::string::npos ||
name_lower.find("transaction") != std::string::npos) {
guessed_kind = "fact";
} else if (name_lower.find("user") != std::string::npos ||
name_lower.find("customer") != std::string::npos ||
name_lower.find("account") != std::string::npos ||
name_lower.find("product") != std::string::npos) {
guessed_kind = "entity";
}
// Build profile JSON
json profile;
profile["guessed_kind"] = guessed_kind;
// SELECT: object_id(0), schema_name(1), object_name(2), object_type(3), engine(4), table_rows_est(5), data_length(6), index_length(7), has_primary_key(8), has_foreign_keys(9), has_time_column(10)
profile["rows_est"] = row->fields[5] ? atol(row->fields[5]) : 0;
profile["size_bytes"] = (atol(row->fields[6] ? row->fields[6] : "0") +
atol(row->fields[7] ? row->fields[7] : "0"));
profile["engine"] = std::string(row->fields[4] ? row->fields[4] : "");
profile["has_primary_key"] = atoi(row->fields[8]) != 0;
profile["has_foreign_keys"] = atoi(row->fields[9]) != 0;
profile["has_time_column"] = atoi(row->fields[10]) != 0;
if (catalog->upsert_profile(current_run_id, object_id, "table_quick", profile.dump()) == 0) {
count++;
}
}
delete resultset;
proxy_info("Static_Harvester: Built %d quick profiles\n", count);
return count;
}
// Rebuild the full-text search index for the current run.
//
// Delegates to catalog->rebuild_fts_index() with the active run id and
// logs the outcome.
//
// Returns:
//   0 on success, -1 if there is no active run or the catalog reports
//   a non-zero status
int Static_Harvester::rebuild_fts_index() {
	if (current_run_id < 0) {
		proxy_error("Static_Harvester: No active run\n");
		return -1;
	}

	const bool rebuilt = (catalog->rebuild_fts_index(current_run_id) == 0);
	if (rebuilt) {
		proxy_info("Static_Harvester: Rebuilt FTS index\n");
		return 0;
	}

	proxy_error("Static_Harvester: Failed to rebuild FTS index\n");
	return -1;
}
// Run a complete harvest of all metadata stages.
//
// Opens a run with start_run() and then executes, in this order:
//   1. harvest_schemas
//   2. harvest_objects
//   3. harvest_columns
//   4. harvest_indexes
//   5. harvest_foreign_keys
//   6. harvest_view_definitions
//   7. build_quick_profiles
//   8. rebuild_fts_index
// The first stage returning a negative value stops the sequence; the run
// is then closed via finish_run() with a note naming the failed stage.
// When every stage succeeds the run is closed with a success note.
//
// Parameters:
//   target_id   - Forwarded to start_run()
//   only_schema - Optional filter to harvest only one schema
//   notes       - Notes for the run, forwarded to start_run()
//
// Returns:
//   The run id (read before finish_run() is called) on success, -1 if
//   start_run() or any stage fails
int Static_Harvester::run_full_harvest(const std::string& target_id, const std::string& only_schema, const std::string& notes) {
	if (start_run(target_id, notes) < 0) {
		return -1;
	}

	// Walk the stages in order; the else-if chain stops at the first failure
	// and remembers which note to record for it.
	const char* failure_note = NULL;
	if (harvest_schemas(only_schema) < 0) {
		failure_note = "Failed during schema harvest";
	} else if (harvest_objects(only_schema) < 0) {
		failure_note = "Failed during object harvest";
	} else if (harvest_columns(only_schema) < 0) {
		failure_note = "Failed during column harvest";
	} else if (harvest_indexes(only_schema) < 0) {
		failure_note = "Failed during index harvest";
	} else if (harvest_foreign_keys(only_schema) < 0) {
		failure_note = "Failed during foreign key harvest";
	} else if (harvest_view_definitions(only_schema) < 0) {
		failure_note = "Failed during view definition harvest";
	} else if (build_quick_profiles() < 0) {
		failure_note = "Failed during profile building";
	} else if (rebuild_fts_index() < 0) {
		failure_note = "Failed during FTS rebuild";
	}

	if (failure_note != NULL) {
		finish_run(failure_note);
		return -1;
	}

	// Read the id first: it is what the caller gets back, independent of
	// whatever finish_run() does to current_run_id.
	const int completed_run_id = current_run_id;
	finish_run("Harvest completed successfully");
	return completed_run_id;
}
// ============================================================
// Statistics Methods
// ============================================================
// Get harvest statistics for the current run.
//
// Convenience overload that forwards the active run id to
// get_harvest_stats(int).
//
// Returns:
//   JSON string with harvest statistics, or a JSON object carrying an
//   "error" member when no run is active
std::string Static_Harvester::get_harvest_stats() {
	const bool has_active_run = (current_run_id >= 0);
	if (has_active_run) {
		return get_harvest_stats(current_run_id);
	}
	return "{\"error\": \"No active run\"}";
}
// Get harvest statistics for a specific run.
//
// Queries the catalog for counts of objects (by type), columns,
// indexes, and foreign keys for the specified run_id.
//
// Parameters:
// run_id - The run ID to get statistics for
//
// Returns:
// JSON string with structure: {"run_id": N, "objects": {...}, "columns": N, "indexes": N, "foreign_keys": N}
std::string Static_Harvester::get_harvest_stats(int run_id) {
char* error = NULL;
int cols = 0, affected = 0;
SQLite3_result* resultset = NULL;
std::ostringstream sql;
json stats;
stats["run_id"] = run_id;
// Count objects
sql.str("");
sql << "SELECT object_type, COUNT(*) FROM objects WHERE run_id = " << run_id
<< " GROUP BY object_type;";
catalog->get_db()->execute_statement(sql.str().c_str(), &error, &cols, &affected, &resultset);
if (resultset) {
json obj_counts = json::object();
for (std::vector<SQLite3_row*>::iterator it = resultset->rows.begin();
it != resultset->rows.end(); ++it) {
obj_counts[(*it)->fields[0]] = atol((*it)->fields[1]);
}
stats["objects"] = obj_counts;
delete resultset;
resultset = NULL;
}
// Count columns
sql.str("");
sql << "SELECT COUNT(*) FROM columns c JOIN objects o ON c.object_id = o.object_id "
<< "WHERE o.run_id = " << run_id << ";";
catalog->get_db()->execute_statement(sql.str().c_str(), &error, &cols, &affected, &resultset);
if (resultset && !resultset->rows.empty()) {
stats["columns"] = atol(resultset->rows[0]->fields[0]);
delete resultset;
resultset = NULL;
}
// Count indexes
sql.str("");
sql << "SELECT COUNT(*) FROM indexes i JOIN objects o ON i.object_id = o.object_id "
<< "WHERE o.run_id = " << run_id << ";";
catalog->get_db()->execute_statement(sql.str().c_str(), &error, &cols, &affected, &resultset);
if (resultset && !resultset->rows.empty()) {
stats["indexes"] = atol(resultset->rows[0]->fields[0]);
delete resultset;
resultset = NULL;
}
// Count foreign keys
sql.str("");
sql << "SELECT COUNT(*) FROM foreign_keys WHERE run_id = " << run_id << ";";
catalog->get_db()->execute_statement(sql.str().c_str(), &error, &cols, &affected, &resultset);
if (resultset && !resultset->rows.empty()) {
stats["foreign_keys"] = atol(resultset->rows[0]->fields[0]);
delete resultset;
}
return stats.dump();
}
#endif /* PROXYSQLGENAI */