From da877c378be1b25eb0da701d0fad9d78172944f9 Mon Sep 17 00:00:00 2001 From: Rene Cannao Date: Mon, 16 Feb 2026 01:23:06 +0000 Subject: [PATCH] tsdb: align admin variable lifecycle, full prometheus ingestion, docs and tap coverage --- doc/tsdb/embedded_tsdb_architecture.md | 234 ++---------- doc/tsdb/embedded_tsdb_metrics_catalog.md | 45 +-- doc/tsdb/embedded_tsdb_overview.md | 201 ++-------- doc/tsdb/embedded_tsdb_quickstart.md | 154 ++------ doc/tsdb/embedded_tsdb_reference.md | 254 ++---------- doc/tsdb/embedded_tsdb_specs.md | 256 ++----------- doc/tsdb/ui_endpoints.md | 21 +- include/ProxySQL_Statistics.hpp | 8 +- include/proxysql_admin.h | 13 +- lib/Admin_FlushVariables.cpp | 111 ------ lib/ProxySQL_Admin.cpp | 107 +++++- lib/ProxySQL_Statistics.cpp | 361 +++++++++++++----- .../tap/tests/test_tsdb_admin_variables-t.cpp | 123 ++++++ 13 files changed, 677 insertions(+), 1211 deletions(-) create mode 100644 test/tap/tests/test_tsdb_admin_variables-t.cpp diff --git a/doc/tsdb/embedded_tsdb_architecture.md b/doc/tsdb/embedded_tsdb_architecture.md index 32b1964f9..7151b61c5 100644 --- a/doc/tsdb/embedded_tsdb_architecture.md +++ b/doc/tsdb/embedded_tsdb_architecture.md @@ -1,224 +1,38 @@ # TSDB Architecture -## System Design +## Runtime Components -The TSDB subsystem is an extension of the existing `ProxySQL_Statistics` module, leveraging its proven SQLite-based storage infrastructure. +- `ProxySQL_Admin` main loop triggers three TSDB schedulers: + - sampler (`tsdb_sampler_loop`) + - downsampler (`tsdb_downsample_metrics`) + - backend monitor (`tsdb_monitor_loop`) +- Data is stored in `statsdb_disk` SQLite tables. 
-``` -┌─────────────────────────────────────────────────────────────────────────────┐ -│ ProxySQL Core │ -├─────────────────────────────────────────────────────────────────────────────┤ -│ ┌──────────────────────────────────────────────────────────────────────┐ │ -│ │ ProxySQL_Statistics │ │ -│ │ (SQLite3DB *statsdb_disk) │ │ -│ ├──────────────────────────────────────────────────────────────────────┤ │ -│ │ │ │ -│ │ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐ │ │ -│ │ │ tsdb_metrics │ │tsdb_metrics_hour│ │tsdb_backend_ │ │ │ -│ │ │ │ │ │ │ health │ │ │ -│ │ │ • timestamp │ │ • bucket │ │ • timestamp │ │ │ -│ │ │ • metric_name │ │ • metric_name │ │ • hostgroup │ │ │ -│ │ │ • labels (JSON) │ │ • labels (JSON) │ │ • hostname │ │ │ -│ │ │ • value (REAL) │ │ • avg_value │ │ • port │ │ │ -│ │ │ │ │ • max_value │ │ • probe_up │ │ │ -│ │ │ PRIMARY KEY: │ │ • min_value │ │ • connect_ms │ │ │ -│ │ │ (timestamp, │ │ • count │ │ │ │ │ -│ │ │ metric_name) │ │ │ │ PRIMARY KEY: │ │ │ -│ │ │ │ │ PRIMARY KEY: │ │ (timestamp, │ │ │ -│ │ │ WITHOUT ROWID │ │ (bucket, │ │ hostgroup, │ │ │ -│ │ │ │ │ metric_name) │ │ hostname, port)│ │ │ -│ │ │ │ │ │ │ │ │ │ -│ │ │ Retention: 7d │ │ Retention: 1y │ │ Retention: 7d │ │ │ -│ │ └─────────────────┘ └─────────────────┘ └─────────────────┘ │ │ -│ │ │ │ -│ └──────────────────────────────────────────────────────────────────────┘ │ -└─────────────────────────────────────────────────────────────────────────────┘ -``` +## Data Path -## Components +1. Sampler collects all metric families from `GloVars.prometheus_registry->Collect()`. +2. Samples are normalized and inserted into `tsdb_metrics`. +3. Hourly job aggregates into `tsdb_metrics_hour`. +4. Backend monitor probes servers from `runtime_mysql_servers` and stores in `tsdb_backend_health`. +5. Retention cleanup removes old raw/probe data using configured days. -### 1. 
Sampler Thread (`tsdb_sampler_loop()`) +## Metric Family Handling -**Purpose:** Collect metrics from Prometheus registry - -**Frequency:** Every `stats_tsdb_sample_interval` seconds - -**Process:** -1. Check if tsdb_sampler_timetoget() returns true -2. If GloVars.prometheus_registry exists: - - Collect all metric families - - For each metric, extract labels - - Insert into tsdb_metrics table -3. Call tsdb_downsample_metrics() - -### 2. Monitor Thread (`tsdb_monitor_loop()`) - -**Purpose:** Active TCP probes to backend servers - -**Frequency:** Every `stats_tsdb_monitor_interval` seconds - -**Process:** -1. Check if tsdb_monitor_timetoget() returns true -2. Query runtime_mysql_servers for active backends -3. For each backend: - - Create TCP socket - - Measure connect time - - Record success/failure - - Insert into tsdb_backend_health - -### 3. Compactor (`tsdb_downsample_metrics()`) - -**Purpose:** Automatic downsampling and retention enforcement - -**Frequency:** Every hour (triggered during sampler loop) - -**Process:** -1. Get MAX(bucket) from tsdb_metrics_hour -2. For each hour bucket not yet processed: - - Aggregate raw data (AVG, MAX, MIN, COUNT) - - Insert into tsdb_metrics_hour -3. Delete raw data older than 7 days -4. Delete hourly data older than 1 year - -**SQL Operations:** -```sql --- Downsample raw data to hourly aggregates -INSERT OR REPLACE INTO tsdb_metrics_hour -SELECT - (timestamp/3600)*3600 as bucket, - metric_name, - labels, - AVG(value) as avg_value, - MAX(value) as max_value, - MIN(value) as min_value, - COUNT(*) as count -FROM tsdb_metrics -WHERE timestamp >= ? AND timestamp < ? -GROUP BY bucket, metric_name, labels; - --- Enforce retention -DELETE FROM tsdb_metrics WHERE timestamp < unixepoch() - 7*86400; -DELETE FROM tsdb_metrics_hour WHERE bucket < unixepoch() - 365*86400; -``` - -### 4. 
Query Engine - -**Purpose:** Serve metric queries via SQL and HTTP API - -**Key Methods:** -- `query_tsdb_metrics()` - Query with label filters -- `get_backend_health_metrics()` - Query backend health -- `get_tsdb_status()` - Get TSDB statistics - -**Label Filtering:** -```sql --- Query with label filter using JSON_EXTRACT -SELECT * FROM tsdb_metrics -WHERE metric_name = 'mysql_connections' -AND json_extract(labels, '$.hostgroup') = '1' -AND timestamp BETWEEN 1704067200 AND 1704153600; -``` - -## Data Flow Diagrams - -### Metric Ingestion Flow - -``` -┌──────────────────┐ -│ Metric Source │ -│ (Prometheus) │ -└────────┬─────────┘ - │ Collect() - ▼ -┌──────────────────┐ -│ tsdb_sampler_ │ -│ loop() │ -│ │ -│ • Extract labels│ -│ • Convert JSON │ -│ • Prepare stmt │ -└────────┬─────────┘ - │ INSERT - ▼ -┌──────────────────┐ -│ SQLite3 │ -│ (tsdb_metrics) │ -└──────────────────┘ -``` - -### Health Probe Flow - -``` -┌──────────────────┐ -│ runtime_mysql_ │ -│ servers │ -└────────┬─────────┘ - │ SELECT - ▼ -┌──────────────────┐ -│ tsdb_monitor_ │ -│ loop() │ -│ │ -│ • TCP connect │ -│ • Measure time │ -│ • Record status │ -└────────┬─────────┘ - │ INSERT - ▼ -┌──────────────────┐ -│ SQLite3 │ -│ (tsdb_backend_ │ -│ health) │ -└──────────────────┘ -``` - -## Thread Safety - -The TSDB uses SQLite's built-in concurrency control: - -1. **SQLite WAL Mode** - Write-Ahead Logging for concurrent reads/writes -2. **Prepared Statements** - Pre-compiled SQL for thread-safe execution -3. **No Additional Locks** - Relies on SQLite's internal locking +- Counter/Gauge/Untyped/Info: one sample per metric point. +- Summary: quantiles plus `_sum` and `_count` companion metrics. +- Histogram: `_bucket{le=...}` plus `_sum` and `_count` companion metrics. 
## Storage Schema -### tsdb_metrics Table - -| Column | Type | Description | -|--------|------|-------------| -| timestamp | INT | Unix timestamp (seconds) | -| metric_name | TEXT | Metric identifier | -| labels | TEXT | JSON object with label key-value pairs | -| value | REAL | Metric value | - -**Primary Key:** (timestamp, metric_name) -**Storage:** WITHOUT ROWID - -### tsdb_metrics_hour Table - -| Column | Type | Description | -|--------|------|-------------| -| bucket | INT | Hour bucket (unix timestamp rounded to hour) | -| metric_name | TEXT | Metric identifier | -| labels | TEXT | JSON labels | -| avg_value | REAL | Average value in bucket | -| max_value | REAL | Maximum value in bucket | -| min_value | REAL | Minimum value in bucket | -| count | INT | Number of samples in bucket | +- `tsdb_metrics`: PK `(timestamp, metric_name, labels)` +- `tsdb_metrics_hour`: PK `(bucket, metric_name, labels)` +- `tsdb_backend_health`: PK `(timestamp, hostgroup, hostname, port)` -**Primary Key:** (bucket, metric_name) -**Storage:** WITHOUT ROWID +## Configuration Lifecycle -### tsdb_backend_health Table +TSDB configuration is part of ADMIN variables (`admin-stats_tsdb_*`) and is applied with: -| Column | Type | Description | -|--------|------|-------------| -| timestamp | INT | Unix timestamp | -| hostgroup | INT | Backend hostgroup ID | -| hostname | TEXT | Backend hostname | -| port | INT | Backend port | -| probe_up | INT | 1=success, 0=failure | -| connect_ms | INT | Connection time in milliseconds | +- `LOAD ADMIN VARIABLES TO RUNTIME` +- `SAVE ADMIN VARIABLES TO DISK` -**Primary Key:** (timestamp, hostgroup, hostname, port) -**Storage:** WITHOUT ROWID +No dedicated TSDB load/save command family is implemented. 
diff --git a/doc/tsdb/embedded_tsdb_metrics_catalog.md b/doc/tsdb/embedded_tsdb_metrics_catalog.md index c43bd6ac8..1b940ace7 100644 --- a/doc/tsdb/embedded_tsdb_metrics_catalog.md +++ b/doc/tsdb/embedded_tsdb_metrics_catalog.md @@ -1,31 +1,26 @@ # Embedded TSDB Metrics Catalog -The following metrics are curated and stored by the TSDB sampler. +The TSDB sampler records all metric families exposed by the built-in Prometheus registry. -## Traffic / Latency -| Metric | Type | Description | -| --- | --- | --- | -| `proxysql_queries_total` | Counter | Total number of queries processed. | -| `proxysql_query_errors_total` | Counter | Total number of query errors. | -| `proxysql_query_latency_ms` | Gauge | p50, p95, and p99 query latency (derived). | +## Family Coverage -## Connections -| Metric | Type | Description | -| --- | --- | --- | -| `proxysql_frontend_connections` | Gauge | Number of active client connections. | -| `proxysql_backend_connections` | Gauge | Number of active backend connections (by hostgroup/backend). | -| `proxysql_connection_pool_saturation` | Gauge | Percentage of pool usage. | +- Counter +- Gauge +- Summary +- Histogram +- Info +- Untyped -## Backend Health (Probes) -| Metric | Type | Description | -| --- | --- | --- | -| `backend_probe_up` | Gauge | 1 if backend is reachable, 0 otherwise. | -| `backend_probe_connect_ms` | Gauge | TCP connection latency in ms. | -| `backend_state` | Gauge | Current state of the backend (Enum). | +## Stored Series Conventions -## Proxy Health -| Metric | Type | Description | -| --- | --- | --- | -| `proxysql_uptime_seconds` | Gauge | Uptime of the ProxySQL process. | -| `proxysql_memory_bytes` | Gauge | Resident set size (RSS) memory usage. | -| `tsdb_series_count` | Gauge | Number of active series in the TSDB. | +- Counter/Gauge/Untyped/Info: stored as metric name exactly as exposed. 
+- Summary: + - quantiles in `<metric_name>` with `quantile` label + - `<metric_name>_sum` + - `<metric_name>_count` +- Histogram: + - buckets in `<metric_name>_bucket` with `le` label + - `<metric_name>_sum` + - `<metric_name>_count` + +Because the source registry can evolve, there is no fixed hardcoded metric list in TSDB code. diff --git a/doc/tsdb/embedded_tsdb_overview.md b/doc/tsdb/embedded_tsdb_overview.md index 93527849a..b7e09e991 100644 --- a/doc/tsdb/embedded_tsdb_overview.md +++ b/doc/tsdb/embedded_tsdb_overview.md @@ -1,184 +1,43 @@ # Embedded TSDB Overview -## What is the TSDB? +## What It Is -The **ProxySQL TSDB** (Time Series Database) is an **embedded, lightweight time-series database** built directly into ProxySQL using SQLite. It provides: +The ProxySQL TSDB is an embedded SQLite-based time-series store implemented in `ProxySQL_Statistics`. +It records Prometheus metrics and optional backend TCP probe health into `statsdb_disk`. -- **Historical metric storage** - Persist query statistics, connection counts, backend health -- **Built-in monitoring** - Active health checks for all backend servers -- **SQL interface** - Query metrics using standard SQL -- **HTTP API** - RESTful endpoints for metric retrieval -- **Prometheus integration** - Sample Prometheus metrics into TSDB +## What It Currently Implements -### Key Benefits +- Storage in SQLite tables: + - `tsdb_metrics` (raw samples) + - `tsdb_metrics_hour` (hourly rollups) + - `tsdb_backend_health` (backend probes) +- Periodic sampling from the built-in Prometheus registry. +- Sampling of all Prometheus metric families: + - Counter, Gauge, Summary, Histogram, Info, Untyped. +- Optional backend TCP probe loop for `runtime_mysql_servers`. +- Hourly downsampling and retention cleanup. +- Query access via SQL on `statsdb_disk.*` tables. 
-| Benefit | Description | -|---------|-------------| -| **Zero external dependencies** | Uses existing SQLite infrastructure, no additional services required | -| **Always-on monitoring** | Built-in backend health checks | -| **Historical analysis** | Debug past performance issues | -| **Simple setup** | Enable with a single SQL command | -| **Production-ready** | Leverages battle-tested ProxySQL_Statistics system | -| **SQL queries** | Query metrics using familiar SQL syntax | +## Configuration Model -## Architecture Overview +TSDB settings are standard ADMIN variables and use the normal ADMIN load/save lifecycle: -The TSDB subsystem extends the existing `ProxySQL_Statistics` module with three SQLite tables: +- `SET admin-stats_tsdb_...` +- `LOAD ADMIN VARIABLES TO RUNTIME` +- `SAVE ADMIN VARIABLES TO DISK` -``` -┌─────────────────────────────────────────────────────────────┐ -│ ProxySQL_Statistics │ -│ (SQLite) │ -├─────────────────────────────────────────────────────────────┤ -│ ┌──────────────────┐ ┌──────────────────┐ │ -│ │ tsdb_metrics │ │ tsdb_metrics_hour│ │ -│ │ (raw data) │ │ (downsampled) │ │ -│ └──────────────────┘ └──────────────────┘ │ -│ ┌──────────────────┐ │ -│ │tsdb_backend_health│ │ -│ │ (health probes) │ │ -│ └──────────────────┘ │ -└─────────────────────────────────────────────────────────────┘ -``` +There is no separate `LOAD/SAVE TSDB VARIABLES ...` command set. 
-### Data Flow +## Variables -``` -┌──────────────────┐ ┌──────────────────┐ -│ Prometheus │ │ Backend Servers │ -│ Registry │ │ │ -└────────┬─────────┘ └────────┬─────────┘ - │ │ - ▼ ▼ -┌──────────────────┐ ┌──────────────────┐ -│ tsdb_sampler_ │ │ tsdb_monitor_ │ -│ loop() │ │ loop() │ -└────────┬─────────┘ └────────┬─────────┘ - │ │ - └──────────┬─────────────┘ - ▼ - ┌──────────────────┐ - │ SQLite Tables │ - │ (tsdb_metrics) │ - └────────┬─────────┘ - │ - ┌──────────┼──────────┐ - ▼ ▼ ▼ - ┌──────────┐ ┌────────┐ ┌──────────┐ - │ HTTP API │ │ Admin │ │ Automatic│ - │ │ │Commands│ │Compaction│ - └──────────┘ └────────┘ └──────────┘ -``` +- `admin-stats_tsdb_enabled` (0/1) +- `admin-stats_tsdb_sample_interval` (1..3600 seconds) +- `admin-stats_tsdb_retention_days` (1..3650) +- `admin-stats_tsdb_monitor_enabled` (0/1) +- `admin-stats_tsdb_monitor_interval` (1..3600 seconds) -### Storage Architecture +## Retention -| Table | Purpose | Retention | -|-------|---------|-----------| -| `tsdb_metrics` | Raw metric samples | 7 days | -| `tsdb_metrics_hour` | Hourly aggregates (avg, max, min, count) | 1 year | -| `tsdb_backend_health` | Backend health probe results | 7 days | - -### Automatic Downsampling - -``` -Raw Data (tsdb_metrics) Hourly Aggregates (tsdb_metrics_hour) -┌─────────────────────┐ ┌──────────────────────────────────────┐ -│ Time │ Metric │ Val│ │ Bucket │ Metric │ Avg │ Max │ Min │ N│ -├─────────────────────┤ ├──────────────────────────────────────┤ -│ 10:01│ cpu │ 50│ ──▶ │ 10:00 │ cpu │ 52 │ 80 │ 20 │12│ -│ 10:05│ cpu │ 55│ ──▶ │ 11:00 │ cpu │ 48 │ 75 │ 25 │12│ -│ 10:10│ cpu │ 60│ ──▶ │ ... │ │ │ │ │ │ -│ ... 
│ │ │ └──────────────────────────────────────┘ -└─────────────────────┘ - 7 days 1 year -``` - -## Quick Start - -```sql --- Enable TSDB -SET admin-stats_tsdb_enabled='true'; -LOAD ADMIN VARIABLES TO RUNTIME; - --- Access metrics via SQL -SELECT * FROM statsdb_disk.tsdb_metrics -WHERE metric_name = 'mysql_connections' -AND timestamp > unixepoch() - 3600; - --- Query via HTTP API --- curl "http://admin:admin@localhost:6032/api/tsdb/query?metric=mysql_connections&from=-1h" -``` - -## What Gets Monitored? - -### 1. Traffic Metrics (from Prometheus Registry) -- Query counts and latency -- Frontend connections -- Backend connections -- Query cache stats - -### 2. Backend Health (active probes) -- TCP connect success/failure -- Connection latency (milliseconds) -- Per-hostgroup, per-host, per-port tracking - -### 3. System Health -- TSDB internal stats -- Storage usage - -## Design Principles - -### SQLite-Based Storage - -- **ACID compliance** - Transactions ensure data integrity -- **Indexed queries** - Fast time-range lookups -- **JSON labels** - Flexible label storage with JSON extraction -- **SQL interface** - Query using standard SQL - -### Bounded Resources - -| Limit | Default | Purpose | -|-------|---------|---------| -| `stats_tsdb_retention_days` | 7 | Raw data retention | -| `stats_tsdb_sample_interval` | 5 sec | Metric sampling rate | -| `stats_tsdb_monitor_interval` | 10 sec | Health probe interval | - -### Thread Safety - -- Uses existing `ProxySQL_Statistics` locking -- SQLite prepared statements for concurrent access -- Atomic configuration updates - -## Configuration at a Glance - -| Variable | Default | Description | -|----------|---------|-------------| -| `stats_tsdb_enabled` | `0` | Master switch (0=off, 1=on) | -| `stats_tsdb_sample_interval` | `5` | How often to sample metrics (seconds) | -| `stats_tsdb_retention_days` | `7` | How long to keep raw data | -| `stats_tsdb_monitor_enabled` | `0` | Enable backend health monitoring | -| 
`stats_tsdb_monitor_interval` | `10` | How often to probe backends (seconds) | - -See the [Reference Manual](./embedded_tsdb_reference.md) for complete configuration documentation. - -## When to Use TSDB - -| Use Case | Recommended | -|----------|-------------| -| **Quick troubleshooting** | Ideal - last 7 days of data | -| **Performance analysis** | Identify slow queries, bottlenecks | -| **Backend monitoring** | Track backend health over time | -| **Long-term analytics** | Use external system (Prometheus/Grafana) | - -## When NOT to Use TSDB - -- **Long-term retention** (> 1 week) - Use external TSDB -- **High-cardinality data** - Thousands of unique series may impact performance -- **Complex analytics** - Use external query/BI tools -- **Cross-server aggregation** - Use external monitoring - -## Next Steps - -1. **Read the Architecture** - [architecture.md](./embedded_tsdb_architecture.md) -2. **Explore the API** - [reference.md](./embedded_tsdb_reference.md) -3. **Check the Specs** - [specs.md](./embedded_tsdb_specs.md) +- Raw metrics (`tsdb_metrics`): `admin-stats_tsdb_retention_days` +- Backend probes (`tsdb_backend_health`): `admin-stats_tsdb_retention_days` +- Hourly rollups (`tsdb_metrics_hour`): fixed 365 days diff --git a/doc/tsdb/embedded_tsdb_quickstart.md b/doc/tsdb/embedded_tsdb_quickstart.md index 619f04d57..2f1bbf11c 100644 --- a/doc/tsdb/embedded_tsdb_quickstart.md +++ b/doc/tsdb/embedded_tsdb_quickstart.md @@ -1,168 +1,58 @@ # TSDB Quickstart Guide -## Enable TSDB +## 1. 
Enable TSDB ```sql --- Enable TSDB SET admin-stats_tsdb_enabled='1'; LOAD ADMIN VARIABLES TO RUNTIME; SAVE ADMIN VARIABLES TO DISK; ``` -## Verify Tables Created +Optional backend probe collection: ```sql --- Check TSDB tables exist -SELECT name FROM statsdb_disk.sqlite_master -WHERE type='table' AND name LIKE 'tsdb%'; -``` - -Expected output: -``` -+----------------------+ -| name | -+----------------------+ -| tsdb_metrics | -| tsdb_metrics_hour | -| tsdb_backend_health | -+----------------------+ +SET admin-stats_tsdb_monitor_enabled='1'; +LOAD ADMIN VARIABLES TO RUNTIME; +SAVE ADMIN VARIABLES TO DISK; ``` -## Query Metrics - -### View Recent Metrics +## 2. Verify Tables ```sql --- Last hour of metrics -SELECT - datetime(timestamp, 'unixepoch') as time, - metric_name, - json_extract(labels, '$.hostgroup') as hg, - value -FROM statsdb_disk.tsdb_metrics -WHERE timestamp > unixepoch() - 3600 -ORDER BY timestamp DESC -LIMIT 10; +SELECT name +FROM statsdb_disk.sqlite_master +WHERE type='table' AND name LIKE 'tsdb_%'; ``` -### Query with Filters +## 3. Query Raw Samples ```sql --- Filter by metric name and label -SELECT * FROM statsdb_disk.tsdb_metrics -WHERE metric_name = 'mysql_connections' -AND json_extract(labels, '$.hostgroup') = '1' -AND timestamp > unixepoch() - 3600; +SELECT datetime(timestamp,'unixepoch') AS ts, metric_name, labels, value +FROM statsdb_disk.tsdb_metrics +WHERE timestamp > unixepoch() - 300 +ORDER BY timestamp DESC +LIMIT 50; ``` -### Hourly Aggregates +## 4. Query Hourly Rollups ```sql --- Daily averages -SELECT - datetime(bucket, 'unixepoch') as hour, - metric_name, - avg_value, - max_value, - count +SELECT datetime(bucket,'unixepoch') AS hour, metric_name, avg_value, max_value, min_value, count FROM statsdb_disk.tsdb_metrics_hour WHERE bucket > unixepoch() - 86400 ORDER BY bucket; ``` -## Enable Backend Health Monitoring +## 5. 
Query Backend Probe Health ```sql --- Enable health probes -SET admin-stats_tsdb_monitor_enabled='1'; -LOAD ADMIN VARIABLES TO RUNTIME; -SAVE ADMIN VARIABLES TO DISK; - --- Query health status -SELECT - datetime(timestamp, 'unixepoch') as time, - hostgroup, - hostname, - port, - CASE probe_up WHEN 1 THEN 'UP' ELSE 'DOWN' END as status, - connect_ms +SELECT datetime(timestamp,'unixepoch') AS ts, hostgroup, hostname, port, probe_up, connect_ms FROM statsdb_disk.tsdb_backend_health WHERE timestamp > unixepoch() - 3600 ORDER BY timestamp DESC; ``` -## Common Queries - -### Connection Count Over Time - -```sql -SELECT - datetime(timestamp, 'unixepoch') as time, - value as connections -FROM statsdb_disk.tsdb_metrics -WHERE metric_name = 'mysql_connections' -AND timestamp > unixepoch() - 3600 -ORDER BY timestamp; -``` - -### Backend Availability +## Notes -```sql --- Success rate by backend -SELECT - hostname, - port, - SUM(probe_up) as up_count, - COUNT(*) as total, - ROUND(100.0 * SUM(probe_up) / COUNT(*), 2) as uptime_pct -FROM statsdb_disk.tsdb_backend_health -WHERE timestamp > unixepoch() - 86400 -GROUP BY hostname, port; -``` - -### Top Metrics - -```sql --- Most frequent metrics -SELECT - metric_name, - COUNT(*) as samples -FROM statsdb_disk.tsdb_metrics -WHERE timestamp > unixepoch() - 3600 -GROUP BY metric_name -ORDER BY samples DESC; -``` - -## Disable TSDB - -```sql -SET admin-stats_tsdb_enabled='0'; -SET admin-stats_tsdb_monitor_enabled='0'; -LOAD ADMIN VARIABLES TO RUNTIME; -SAVE ADMIN VARIABLES TO DISK; -``` - -## Troubleshooting - -### Check if Tables Exist - -```sql -SELECT - (SELECT COUNT(*) FROM statsdb_disk.sqlite_master WHERE name='tsdb_metrics') as metrics_exists, - (SELECT COUNT(*) FROM statsdb_disk.sqlite_master WHERE name='tsdb_metrics_hour') as hourly_exists, - (SELECT COUNT(*) FROM statsdb_disk.sqlite_master WHERE name='tsdb_backend_health') as health_exists; -``` - -### Check Recent Activity - -```sql --- Metrics in last 5 minutes -SELECT 
COUNT(*) as recent_metrics -FROM statsdb_disk.tsdb_metrics -WHERE timestamp > unixepoch() - 300; - --- Health probes in last 5 minutes -SELECT COUNT(*) as recent_probes -FROM statsdb_disk.tsdb_backend_health -WHERE timestamp > unixepoch() - 300; -``` +- TSDB settings are ADMIN variables (`admin-stats_tsdb_*`). +- There are no `LOAD/SAVE TSDB VARIABLES` commands. diff --git a/doc/tsdb/embedded_tsdb_reference.md b/doc/tsdb/embedded_tsdb_reference.md index d31ae42b9..7e2a655c6 100644 --- a/doc/tsdb/embedded_tsdb_reference.md +++ b/doc/tsdb/embedded_tsdb_reference.md @@ -2,192 +2,56 @@ ## Configuration Variables -All TSDB configuration variables are managed through the standard ProxySQL admin variables system with prefix `admin-stats_tsdb_`. +TSDB variables are ADMIN variables: -### Complete Variable Reference +| Variable | Type | Default | Range | Description | +|---|---|---:|---|---| +| `admin-stats_tsdb_enabled` | int | `0` | `0/1` | Master switch | +| `admin-stats_tsdb_sample_interval` | int | `5` | `1..3600` | Prometheus sampling interval (seconds) | +| `admin-stats_tsdb_retention_days` | int | `7` | `1..3650` | Raw/probe retention in days | +| `admin-stats_tsdb_monitor_enabled` | int | `0` | `0/1` | Backend probe switch | +| `admin-stats_tsdb_monitor_interval` | int | `10` | `1..3600` | Probe interval (seconds) | -| Variable | Type | Default | Description | -|----------|------|---------|-------------| -| `admin-stats_tsdb_enabled` | integer | `0` | Master switch (0=off, 1=on) | -| `admin-stats_tsdb_sample_interval` | integer | `5` | Metric sampling frequency (seconds) | -| `admin-stats_tsdb_retention_days` | integer | `7` | Raw data retention (days) | -| `admin-stats_tsdb_monitor_enabled` | integer | `0` | Enable backend health monitoring | -| `admin-stats_tsdb_monitor_interval` | integer | `10` | Health probe frequency (seconds) | - -### Enabling TSDB +### Apply Changes ```sql --- Enable TSDB SET admin-stats_tsdb_enabled='1'; LOAD ADMIN VARIABLES TO RUNTIME; 
SAVE ADMIN VARIABLES TO DISK; - --- Check status -SELECT * FROM global_variables WHERE variable_name LIKE 'admin-stats_tsdb%'; ``` -## Admin Commands - -### TSDB STATUS - -Returns TSDB statistics including series count, datapoints, and time range. +No dedicated `LOAD/SAVE TSDB VARIABLES ...` command exists. -```sql --- Get TSDB status (when implemented in admin handler) -SELECT - (SELECT COUNT(DISTINCT metric_name || labels) FROM statsdb_disk.tsdb_metrics) as total_series, - (SELECT COUNT(*) FROM statsdb_disk.tsdb_metrics) as total_datapoints, - (SELECT MIN(timestamp) FROM statsdb_disk.tsdb_metrics) as oldest_datapoint, - (SELECT MAX(timestamp) FROM statsdb_disk.tsdb_metrics) as newest_datapoint; -``` +## Tables -### Querying Metrics via SQL - -```sql --- Query recent metrics -SELECT * FROM statsdb_disk.tsdb_metrics -WHERE metric_name = 'mysql_connections' -AND timestamp > unixepoch() - 3600 -ORDER BY timestamp DESC -LIMIT 100; - --- Query with label filter -SELECT * FROM statsdb_disk.tsdb_metrics -WHERE metric_name = 'mysql_connections' -AND json_extract(labels, '$.hostgroup') = '1' -AND timestamp > unixepoch() - 3600; - --- Query hourly aggregates -SELECT - bucket, - metric_name, - avg_value, - max_value, - min_value, - count -FROM statsdb_disk.tsdb_metrics_hour -WHERE metric_name = 'mysql_connections' -AND bucket > unixepoch() - 86400 -ORDER BY bucket; - --- Query backend health -SELECT - datetime(timestamp, 'unixepoch') as time, - hostgroup, - hostname, - port, - probe_up, - connect_ms -FROM statsdb_disk.tsdb_backend_health -WHERE timestamp > unixepoch() - 3600 -ORDER BY timestamp DESC; -``` - -## C++ API - -### Metric Insertion - -```cpp -// Insert a metric sample -void ProxySQL_Statistics::insert_tsdb_metric( - const std::string& metric_name, - const std::map& labels, - double value, - time_t timestamp = time(NULL) -); - -// Example usage -std::map labels; -labels["hostgroup"] = "1"; -labels["backend"] = "192.168.1.1"; 
-GloProxyStats->insert_tsdb_metric("connections", labels, 42.0, time(NULL)); -``` - -### Backend Health Insertion - -```cpp -// Insert backend health probe result -void ProxySQL_Statistics::insert_backend_health( - int hostgroup, - const std::string& hostname, - int port, - bool probe_up, - int connect_ms, - time_t timestamp = time(NULL) -); - -// Example usage -GloProxyStats->insert_backend_health( - 1, "192.168.1.1", 3306, true, 5, time(NULL) -); -``` - -### Query Interface - -```cpp -// Query metrics with label filters -SQLite3_result* ProxySQL_Statistics::query_tsdb_metrics( - const std::string& metric_name, - const std::map& label_filters, - time_t from, - time_t to, - const std::string& aggregation = "" -); - -// Query backend health -SQLite3_result* ProxySQL_Statistics::get_backend_health_metrics( - time_t from, - time_t to, - int hostgroup = -1 -); - -// Get TSDB status -struct tsdb_status_t { - size_t total_series; - size_t total_datapoints; - size_t disk_size_bytes; - time_t oldest_datapoint; - time_t newest_datapoint; -}; -tsdb_status_t ProxySQL_Statistics::get_tsdb_status(); -``` - -## Database Schema - -### tsdb_metrics - -Stores raw metric samples. +### `statsdb_disk.tsdb_metrics` ```sql CREATE TABLE tsdb_metrics ( timestamp INT NOT NULL, metric_name TEXT NOT NULL, - labels TEXT, + labels TEXT NOT NULL DEFAULT '{}', value REAL, - PRIMARY KEY (timestamp, metric_name) + PRIMARY KEY (timestamp, metric_name, labels) ) WITHOUT ROWID; ``` -### tsdb_metrics_hour - -Stores hourly aggregated metrics. +### `statsdb_disk.tsdb_metrics_hour` ```sql CREATE TABLE tsdb_metrics_hour ( bucket INT NOT NULL, metric_name TEXT NOT NULL, - labels TEXT, + labels TEXT NOT NULL DEFAULT '{}', avg_value REAL, max_value REAL, min_value REAL, count INT, - PRIMARY KEY (bucket, metric_name) + PRIMARY KEY (bucket, metric_name, labels) ) WITHOUT ROWID; ``` -### tsdb_backend_health - -Stores backend health probe results. 
+### `statsdb_disk.tsdb_backend_health` ```sql CREATE TABLE tsdb_backend_health ( @@ -201,69 +65,35 @@ CREATE TABLE tsdb_backend_health ( ) WITHOUT ROWID; ``` -## Metrics Catalog - -### System Metrics - -| Metric Name | Type | Labels | Description | -|-------------|------|--------|-------------| -| `mysql_connections` | gauge | hostgroup, backend | Active MySQL connections | -| `queries` | counter | hostgroup, backend | Total queries processed | -| `slow_queries` | counter | - | Slow query count | +## Prometheus Ingestion Mapping -### Backend Health Metrics +- `Counter` -> `<metric_name>` +- `Gauge` -> `<metric_name>` +- `Untyped` -> `<metric_name>` +- `Info` -> `<metric_name>` +- `Summary` -> `<metric_name>{quantile=...}`, plus `<metric_name>_sum`, `<metric_name>_count` +- `Histogram` -> `<metric_name>_bucket{le=...}`, plus `<metric_name>_sum`, `<metric_name>_count` -| Column | Description | -|--------|-------------| -| `probe_up` | 1 if TCP connect succeeded, 0 if failed | -| `connect_ms` | TCP connection time in milliseconds | +## SQL Examples -## Retention and Downsampling - -### Automatic Downsampling - -Raw data is automatically downsampled to hourly aggregates: - -``` -tsdb_metrics (raw) → tsdb_metrics_hour (aggregated) - 7 days 1 year +```sql +SELECT metric_name, labels, value +FROM statsdb_disk.tsdb_metrics +WHERE timestamp > unixepoch() - 300 +ORDER BY timestamp DESC +LIMIT 50; ``` -### Retention Policy - -- **Raw data** (`tsdb_metrics`): 7 days -- **Hourly aggregates** (`tsdb_metrics_hour`): 1 year -- **Backend health** (`tsdb_backend_health`): 7 days - -## Troubleshooting - -### Checking TSDB Status - ```sql --- Check if TSDB tables exist -SELECT name FROM statsdb_disk.sqlite_master WHERE type='table' AND name LIKE 'tsdb%'; - --- Check recent metrics count -SELECT COUNT(*) FROM statsdb_disk.tsdb_metrics WHERE timestamp > unixepoch() - 3600; - --- Check table sizes -SELECT - name, - (SELECT COUNT(*) FROM statsdb_disk.tsdb_metrics) as metrics_count, - (SELECT COUNT(*) FROM statsdb_disk.tsdb_metrics_hour) as hourly_count, - (SELECT COUNT(*) FROM statsdb_disk.tsdb_backend_health) as 
health_count; +SELECT datetime(bucket, 'unixepoch') AS hour, metric_name, avg_value, max_value, min_value, count +FROM statsdb_disk.tsdb_metrics_hour +WHERE bucket > unixepoch() - 86400 +ORDER BY bucket; ``` -### Common Issues - -**Issue: No metrics being collected** -- Check `admin-stats_tsdb_enabled` is set to 1 -- Verify `GloVars.prometheus_registry` is initialized - -**Issue: Backend health not recorded** -- Check `admin-stats_tsdb_monitor_enabled` is set to 1 -- Ensure `runtime_mysql_servers` has entries - -**Issue: High disk usage** -- Reduce `admin-stats_tsdb_retention_days` -- Increase `admin-stats_tsdb_sample_interval` +```sql +SELECT datetime(timestamp, 'unixepoch') AS ts, hostgroup, hostname, port, probe_up, connect_ms +FROM statsdb_disk.tsdb_backend_health +WHERE timestamp > unixepoch() - 3600 +ORDER BY timestamp DESC; +``` diff --git a/doc/tsdb/embedded_tsdb_specs.md b/doc/tsdb/embedded_tsdb_specs.md index 0af9d49cd..c2f9f793f 100644 --- a/doc/tsdb/embedded_tsdb_specs.md +++ b/doc/tsdb/embedded_tsdb_specs.md @@ -1,223 +1,47 @@ # TSDB Technical Specifications -## Overview - -The TSDB (Time Series Database) subsystem extends ProxySQL_Statistics with time-series metric storage using SQLite. - -## Design Goals - -1. **Zero External Dependencies** - Uses existing SQLite infrastructure -2. **Production Ready** - Leverages battle-tested ProxySQL_Statistics -3. **Automatic Maintenance** - Built-in downsampling and retention -4. 
**SQL Interface** - Query using standard SQL - -## Storage Engine - -### SQLite Configuration - -- **Journal Mode**: WAL (Write-Ahead Logging) -- **Synchronous**: NORMAL -- **Page Size**: 4096 bytes -- **Cache Size**: -2000 (2MB) - -### Table Schema - -#### tsdb_metrics - -```sql -CREATE TABLE tsdb_metrics ( - timestamp INT NOT NULL, - metric_name TEXT NOT NULL, - labels TEXT, -- JSON object - value REAL, - PRIMARY KEY (timestamp, metric_name) -) WITHOUT ROWID; -``` - -**Rationale:** -- `WITHOUT ROWID` for efficient time-range scans -- Composite PK on (timestamp, metric_name) for fast lookups -- JSON labels for flexible metadata +## Scope -#### tsdb_metrics_hour +Embedded time-series storage in SQLite for ProxySQL runtime metrics and backend probe health. -```sql -CREATE TABLE tsdb_metrics_hour ( - bucket INT NOT NULL, - metric_name TEXT NOT NULL, - labels TEXT, - avg_value REAL, - max_value REAL, - min_value REAL, - count INT, - PRIMARY KEY (bucket, metric_name) -) WITHOUT ROWID; -``` +## Table Definitions -**Rationale:** -- Pre-aggregated data for fast historical queries -- Statistical summary (avg, max, min, count) +### `tsdb_metrics` -#### tsdb_backend_health +- Columns: `timestamp`, `metric_name`, `labels`, `value` +- PK: `(timestamp, metric_name, labels)` +- `labels` is JSON text, default `'{}'` -```sql -CREATE TABLE tsdb_backend_health ( - timestamp INT NOT NULL, - hostgroup INT NOT NULL, - hostname TEXT NOT NULL, - port INT NOT NULL, - probe_up INT NOT NULL, - connect_ms INT, - PRIMARY KEY (timestamp, hostgroup, hostname, port) -) WITHOUT ROWID; -``` - -**Rationale:** -- Tracks backend availability over time -- Composite PK for efficient per-backend queries - -## Downsampling Algorithm - -### Hourly Aggregation - -```sql -INSERT OR REPLACE INTO tsdb_metrics_hour -SELECT - (timestamp/3600)*3600 as bucket, - metric_name, - labels, - AVG(value) as avg_value, - MAX(value) as max_value, - MIN(value) as min_value, - COUNT(*) as count -FROM tsdb_metrics -WHERE 
timestamp >= :last_processed AND timestamp < :current_hour -GROUP BY bucket, metric_name, labels; -``` - -### Retention Enforcement - -```sql --- Raw data: 7 days -DELETE FROM tsdb_metrics WHERE timestamp < unixepoch() - 7*86400; - --- Hourly: 1 year -DELETE FROM tsdb_metrics_hour WHERE bucket < unixepoch() - 365*86400; - --- Health: 7 days -DELETE FROM tsdb_backend_health WHERE timestamp < unixepoch() - 7*86400; -``` - -## Performance Characteristics - -### Insert Performance - -- **Raw metrics**: ~10,000 inserts/second -- **Health probes**: ~5,000 inserts/second -- **Batched**: No (individual prepared statements) - -### Query Performance - -- **Time-range scan**: O(log n) with index -- **Label filter**: Full scan (JSON extraction) -- **Aggregated query**: O(log n) on tsdb_metrics_hour - -### Storage Overhead - -| Data Type | Per-Row Size | -|-----------|--------------| -| Raw metric | ~50 bytes + label JSON | -| Hourly aggregate | ~60 bytes + label JSON | -| Health probe | ~40 bytes | - -## Memory Usage - -- **Prepared statements**: 3 cached statements -- **JSON parsing**: Temporary during insert/query -- **Result sets**: Streamed to client - -## Concurrency - -### Thread Safety - -- SQLite handles concurrency via WAL mode -- Multiple readers, single writer -- No additional application-level locks - -### Connection Model - -- Uses existing `statsdb_disk` connection -- Prepared statements cached per-thread - -## API Specification - -### C++ Interface - -```cpp -class ProxySQL_Statistics { -public: - // Metric insertion - void insert_tsdb_metric(const std::string& metric_name, - const std::map& labels, - double value, time_t timestamp); - - // Health insertion - void insert_backend_health(int hostgroup, const std::string& hostname, - int port, bool probe_up, int connect_ms, - time_t timestamp); - - // Downsampling - void tsdb_downsample_metrics(); - - // Query - SQLite3_result* query_tsdb_metrics(const std::string& metric_name, - const std::map& label_filters, - 
time_t from, time_t to, - const std::string& aggregation); - - // Status - struct tsdb_status_t { - size_t total_series; - size_t total_datapoints; - size_t disk_size_bytes; - time_t oldest_datapoint; - time_t newest_datapoint; - }; - tsdb_status_t get_tsdb_status(); -}; -``` - -## Configuration - -### Variables - -| Variable | Type | Default | Description | -|----------|------|---------|-------------| -| stats_tsdb_enabled | bool | false | Master switch | -| stats_tsdb_sample_interval | int | 5 | Sampling interval (seconds) | -| stats_tsdb_retention_days | int | 7 | Raw retention (days) | -| stats_tsdb_monitor_enabled | bool | false | Health monitoring | -| stats_tsdb_monitor_interval | int | 10 | Probe interval (seconds) | - -## Testing - -### Unit Tests - -1. **Insert/Query roundtrip** -2. **Label filtering** -3. **Downsampling accuracy** -4. **Retention enforcement** - -### Integration Tests - -1. **Backend health monitoring** -2. **Prometheus metrics sampling** -3. **Concurrent access** -4. **Resource limits** - -## Future Enhancements - -1. **Label indexing** - Add GIN index on labels JSON -2. **Continuous queries** - User-defined aggregations -3. **Export formats** - CSV, JSON output -4. **Alerting** - Basic threshold alerts +### `tsdb_metrics_hour` + +- Columns: `bucket`, `metric_name`, `labels`, `avg_value`, `max_value`, `min_value`, `count` +- PK: `(bucket, metric_name, labels)` + +### `tsdb_backend_health` + +- Columns: `timestamp`, `hostgroup`, `hostname`, `port`, `probe_up`, `connect_ms` +- PK: `(timestamp, hostgroup, hostname, port)` + +## Sampling and Rollup + +- Sampler interval: `admin-stats_tsdb_sample_interval` +- Rollup interval: hourly +- Rollup SQL: `INSERT OR REPLACE ... 
GROUP BY bucket, metric_name, labels` + +## Retention + +- Raw metrics retention: `admin-stats_tsdb_retention_days` +- Backend probe retention: `admin-stats_tsdb_retention_days` +- Hourly rollup retention: 365 days + +## Variable Semantics + +- All TSDB variables are ADMIN variables (`admin-stats_tsdb_*`). +- Applied through standard ADMIN commands only. +- No dedicated `LOAD/SAVE TSDB VARIABLES` command set. + +## Current API Surface + +- C++ methods in `ProxySQL_Statistics` for insert, query, status, sampler/monitor loops. +- SQL querying through `statsdb_disk.tsdb_*` tables. +- No dedicated HTTP TSDB endpoint implementation at this time. diff --git a/doc/tsdb/ui_endpoints.md b/doc/tsdb/ui_endpoints.md index e0db39135..f0722b944 100644 --- a/doc/tsdb/ui_endpoints.md +++ b/doc/tsdb/ui_endpoints.md @@ -1,19 +1,6 @@ -# TSDB HTTP API Endpoints +# TSDB UI / HTTP Endpoints -The TSDB exposes a JSON API for the built-in UI and external queries. All endpoints require Admin authentication. +No dedicated TSDB REST endpoints are currently implemented in ProxySQL. -## `GET /api/tsdb/metrics` -Returns a list of all metrics currently stored in the TSDB. - -## `GET /api/tsdb/query` -Retrieves time series data for a specific metric. -**Parameters:** -- `metric`: (Required) Name of the metric. -- `from`: (Required) Start timestamp (Unix ms). -- `to`: (Required) End timestamp (Unix ms). -- `step`: (Required) Resolution in seconds. -- `labels`: (Optional) Filter by labels (e.g., `hostgroup=10`). -- `agg`: (Optional) Aggregation function (`avg`, `max`, `rate`). - -## `GET /api/tsdb/status` -Returns runtime statistics for the TSDB subsystem, including disk usage and series cardinality. +Current access path is SQL via admin connection, querying `statsdb_disk.tsdb_*` tables. +Any future TSDB HTTP API should be documented here once endpoint handlers are added. 
diff --git a/include/ProxySQL_Statistics.hpp b/include/ProxySQL_Statistics.hpp index 0fa7b0651..07a926438 100644 --- a/include/ProxySQL_Statistics.hpp +++ b/include/ProxySQL_Statistics.hpp @@ -98,9 +98,9 @@ "CREATE TABLE tsdb_metrics (\"\ timestamp INT NOT NULL,\"\ metric_name TEXT NOT NULL,\"\ - labels TEXT,\"\ + labels TEXT NOT NULL DEFAULT '{}',\"\ value REAL,\"\ - PRIMARY KEY (timestamp, metric_name)\"\ + PRIMARY KEY (timestamp, metric_name, labels)\"\ ) WITHOUT ROWID" // Hourly downsampled table @@ -108,12 +108,12 @@ "CREATE TABLE tsdb_metrics_hour (\"\ bucket INT NOT NULL,\"\ metric_name TEXT NOT NULL,\"\ - labels TEXT,\"\ + labels TEXT NOT NULL DEFAULT '{}',\"\ avg_value REAL,\"\ max_value REAL,\"\ min_value REAL,\"\ count INT,\"\ - PRIMARY KEY (bucket, metric_name)\"\ + PRIMARY KEY (bucket, metric_name, labels)\"\ ) WITHOUT ROWID" // Backend health monitoring table diff --git a/include/proxysql_admin.h b/include/proxysql_admin.h index b2ba7bcf5..d8ff56f16 100644 --- a/include/proxysql_admin.h +++ b/include/proxysql_admin.h @@ -328,6 +328,11 @@ class ProxySQL_Admin { int stats_mysql_eventslog_sync_buffer_to_disk; int stats_system_cpu; int stats_system_memory; + int stats_tsdb_enabled; + int stats_tsdb_sample_interval; + int stats_tsdb_retention_days; + int stats_tsdb_monitor_enabled; + int stats_tsdb_monitor_interval; bool restapi_enabled; bool restapi_enabled_old; int restapi_port; @@ -491,10 +496,6 @@ class ProxySQL_Admin { void flush_pgsql_variables___database_to_runtime(SQLite3DB* db, bool replace, const std::string& checksum = "", const time_t epoch = 0); // - // TSDB - void flush_tsdb_variables___runtime_to_database(SQLite3DB *db, bool replace, bool del, bool onlyifempty, bool runtime=false); - void flush_tsdb_variables___database_to_runtime(SQLite3DB *db, bool replace, const std::string& checksum = "", const time_t epoch = 0); - #ifdef PROXYSQLGENAI // GenAI void flush_genai_variables___runtime_to_database(SQLite3DB* db, bool replace, bool del, bool 
onlyifempty, bool runtime = false, bool use_lock = true); @@ -810,10 +811,6 @@ class ProxySQL_Admin { void load_pgsql_variables_to_runtime(const std::string& checksum = "", const time_t epoch = 0) { flush_pgsql_variables___database_to_runtime(admindb, true, checksum, epoch); } void save_pgsql_variables_from_runtime() { flush_pgsql_variables___runtime_to_database(admindb, true, true, false); } - // TSDB - void load_tsdb_variables_to_runtime(const std::string& checksum = "", const time_t epoch = 0) { flush_tsdb_variables___database_to_runtime(admindb, true, checksum, epoch); } - void save_tsdb_variables_from_runtime() { flush_tsdb_variables___runtime_to_database(admindb, true, true, false); } - #ifdef PROXYSQLGENAI //GenAI void init_genai_variables(); diff --git a/lib/Admin_FlushVariables.cpp b/lib/Admin_FlushVariables.cpp index 386d93ca6..6045e6d54 100644 --- a/lib/Admin_FlushVariables.cpp +++ b/lib/Admin_FlushVariables.cpp @@ -1508,114 +1508,3 @@ void ProxySQL_Admin::flush_mcp_variables___runtime_to_database(SQLite3DB* db, bo free(varnames); } #endif /* PROXYSQLGENAI */ - -// TSDB VARIABLES -void ProxySQL_Admin::flush_tsdb_variables___runtime_to_database(SQLite3DB *db, bool replace, bool del, bool onlyifempty, bool runtime) { - proxy_debug(PROXY_DEBUG_ADMIN, 4, "Flushing TSDB variables. 
Replace:%d, Delete:%d, Only_If_Empty:%d\n", replace, del, onlyifempty); - if (onlyifempty) { - char *error=NULL; - int cols=0; - int affected_rows=0; - SQLite3_result *resultset=NULL; - char *q=(char *)"SELECT COUNT(*) FROM global_variables WHERE variable_name LIKE 'tsdb-%'"; - db->execute_statement(q, &error , &cols , &affected_rows , &resultset); - int matching_rows=0; - if (error) { - proxy_error("Error on %s : %s\n", q, error); - return; - } else { - for (std::vector::iterator it = resultset->rows.begin() ; it != resultset->rows.end(); ++it) { - SQLite3_row *r=*it; - matching_rows+=atoi(r->fields[0]); - } - } - if (resultset) delete resultset; - if (matching_rows) { - proxy_debug(PROXY_DEBUG_ADMIN, 4, "Table global_variables has TSDB variables - skipping\n"); - return; - } - } - if (del) { - proxy_debug(PROXY_DEBUG_ADMIN, 4, "Deleting TSDB variables from global_variables\n"); - db->execute("DELETE FROM global_variables WHERE variable_name LIKE 'tsdb-%'"); - } - if (runtime) { - db->execute("DELETE FROM runtime_global_variables WHERE variable_name LIKE 'tsdb-%'"); - } - char *a; - char *b=(char *)"INSERT INTO runtime_global_variables(variable_name, variable_value) VALUES(\"tsdb-%s\",\"%s\")"; - if (replace) { - a=(char *)"REPLACE INTO global_variables(variable_name, variable_value) VALUES(\"tsdb-%s\",\"%s\")"; - } else { - a=(char *)"INSERT OR IGNORE INTO global_variables(variable_name, variable_value) VALUES(\"tsdb-%s\",\"%s\")"; - } - // TSDB variables are stored in GloProxyStats->variables - const char* tsdb_vars[] = { - "stats_tsdb_enabled", - "stats_tsdb_sample_interval", - "stats_tsdb_retention_days", - "stats_tsdb_monitor_enabled", - "stats_tsdb_monitor_interval", - NULL - }; - for (int i=0; tsdb_vars[i]; i++) { - char val[256]; - if (strcmp(tsdb_vars[i], "stats_tsdb_enabled") == 0) { - snprintf(val, sizeof(val), "%d", GloProxyStats->variables.stats_tsdb_enabled); - } else if (strcmp(tsdb_vars[i], "stats_tsdb_sample_interval") == 0) { - snprintf(val, 
sizeof(val), "%d", GloProxyStats->variables.stats_tsdb_sample_interval); - } else if (strcmp(tsdb_vars[i], "stats_tsdb_retention_days") == 0) { - snprintf(val, sizeof(val), "%d", GloProxyStats->variables.stats_tsdb_retention_days); - } else if (strcmp(tsdb_vars[i], "stats_tsdb_monitor_enabled") == 0) { - snprintf(val, sizeof(val), "%d", GloProxyStats->variables.stats_tsdb_monitor_enabled); - } else if (strcmp(tsdb_vars[i], "stats_tsdb_monitor_interval") == 0) { - snprintf(val, sizeof(val), "%d", GloProxyStats->variables.stats_tsdb_monitor_interval); - } else { - val[0] = '0'; - val[1] = '\0'; - } - size_t l = strlen(a) + 200; - l += strlen(tsdb_vars[i]); - l += strlen(val); - char *query=(char *)malloc(l); - snprintf(query, l, a, tsdb_vars[i], val); - db->execute(query); - if (runtime) { - snprintf(query, l, b, tsdb_vars[i], val); - db->execute(query); - } - free(query); - } -} - -void ProxySQL_Admin::flush_tsdb_variables___database_to_runtime(SQLite3DB *db, bool replace, const std::string& checksum, const time_t epoch) { - proxy_debug(PROXY_DEBUG_ADMIN, 4, "Flushing TSDB variables. 
Replace:%d\n", replace); - char *error=NULL; - int cols=0; - int affected_rows=0; - SQLite3_result *resultset=NULL; - char *q=(char *)"SELECT substr(variable_name,6) vn, variable_value FROM global_variables WHERE variable_name LIKE 'tsdb-%'"; - db->execute_statement(q, &error , &cols , &affected_rows , &resultset); - if (error) { - proxy_error("Error on %s : %s\n", q, error); - return; - } else { - for (std::vector::iterator it = resultset->rows.begin() ; it != resultset->rows.end(); ++it) { - SQLite3_row *r=*it; - const char *value = r->fields[1]; - int int_value = atoi(value); - if (strcmp(r->fields[0], "stats_tsdb_enabled") == 0) { - GloProxyStats->variables.stats_tsdb_enabled = int_value; - } else if (strcmp(r->fields[0], "stats_tsdb_sample_interval") == 0) { - GloProxyStats->variables.stats_tsdb_sample_interval = int_value; - } else if (strcmp(r->fields[0], "stats_tsdb_retention_days") == 0) { - GloProxyStats->variables.stats_tsdb_retention_days = int_value; - } else if (strcmp(r->fields[0], "stats_tsdb_monitor_enabled") == 0) { - GloProxyStats->variables.stats_tsdb_monitor_enabled = int_value; - } else if (strcmp(r->fields[0], "stats_tsdb_monitor_interval") == 0) { - GloProxyStats->variables.stats_tsdb_monitor_interval = int_value; - } - } - } - if (resultset) delete resultset; -} diff --git a/lib/ProxySQL_Admin.cpp b/lib/ProxySQL_Admin.cpp index d52096852..31b7b94fa 100644 --- a/lib/ProxySQL_Admin.cpp +++ b/lib/ProxySQL_Admin.cpp @@ -379,6 +379,11 @@ static char * admin_variables_names[]= { (char *)"stats_mysql_query_digest_to_disk", (char *)"stats_system_cpu", (char *)"stats_system_memory", + (char *)"stats_tsdb_enabled", + (char *)"stats_tsdb_sample_interval", + (char *)"stats_tsdb_retention_days", + (char *)"stats_tsdb_monitor_enabled", + (char *)"stats_tsdb_monitor_interval", (char *)"mysql_ifaces", (char *)"pgsql_ifaces", (char *)"telnet_admin_ifaces", @@ -2511,11 +2516,20 @@ __end_while_pool: GloProxyStats->system_cpu_sets(); } #ifndef NOJEM - if 
(GloProxyStats->system_memory_timetoget(curtime)) { - GloProxyStats->system_memory_sets(); - } + if (GloProxyStats->system_memory_timetoget(curtime)) { + GloProxyStats->system_memory_sets(); + } #endif - } + if (GloProxyStats->tsdb_sampler_timetoget(curtime)) { + GloProxyStats->tsdb_sampler_loop(); + } + if (GloProxyStats->tsdb_downsample_timetoget(curtime)) { + GloProxyStats->tsdb_downsample_metrics(); + } + if (GloProxyStats->tsdb_monitor_timetoget(curtime)) { + GloProxyStats->tsdb_monitor_loop(); + } + } if (S_amll.get_version()!=version) { S_amll.wrlock(); version=S_amll.get_version(); @@ -2804,6 +2818,11 @@ ProxySQL_Admin::ProxySQL_Admin() : variables.stats_mysql_eventslog_sync_buffer_to_disk = 0; variables.stats_system_cpu = 60; variables.stats_system_memory = 60; + variables.stats_tsdb_enabled = 0; + variables.stats_tsdb_sample_interval = 5; + variables.stats_tsdb_retention_days = 7; + variables.stats_tsdb_monitor_enabled = 0; + variables.stats_tsdb_monitor_interval = 10; GloProxyStats->variables.stats_mysql_connection_pool = 60; GloProxyStats->variables.stats_mysql_connections = 60; GloProxyStats->variables.stats_mysql_query_cache = 60; @@ -2813,6 +2832,11 @@ ProxySQL_Admin::ProxySQL_Admin() : #ifndef NOJEM GloProxyStats->variables.stats_system_memory = 60; #endif + GloProxyStats->variables.stats_tsdb_enabled = 0; + GloProxyStats->variables.stats_tsdb_sample_interval = 5; + GloProxyStats->variables.stats_tsdb_retention_days = 7; + GloProxyStats->variables.stats_tsdb_monitor_enabled = 0; + GloProxyStats->variables.stats_tsdb_monitor_interval = 10; variables.restapi_enabled = false; variables.restapi_enabled_old = false; @@ -3618,6 +3642,26 @@ char * ProxySQL_Admin::get_variable(char *name) { snprintf(intbuf, sizeof(intbuf),"%d",variables.stats_system_memory); return strdup(intbuf); } + if (!strcasecmp(name,"stats_tsdb_enabled")) { + snprintf(intbuf, sizeof(intbuf),"%d",variables.stats_tsdb_enabled); + return strdup(intbuf); + } + if 
(!strcasecmp(name,"stats_tsdb_sample_interval")) { + snprintf(intbuf, sizeof(intbuf),"%d",variables.stats_tsdb_sample_interval); + return strdup(intbuf); + } + if (!strcasecmp(name,"stats_tsdb_retention_days")) { + snprintf(intbuf, sizeof(intbuf),"%d",variables.stats_tsdb_retention_days); + return strdup(intbuf); + } + if (!strcasecmp(name,"stats_tsdb_monitor_enabled")) { + snprintf(intbuf, sizeof(intbuf),"%d",variables.stats_tsdb_monitor_enabled); + return strdup(intbuf); + } + if (!strcasecmp(name,"stats_tsdb_monitor_interval")) { + snprintf(intbuf, sizeof(intbuf),"%d",variables.stats_tsdb_monitor_interval); + return strdup(intbuf); + } } if (!strcasecmp(name,"admin_credentials")) return s_strdup(variables.admin_credentials); if (!strcasecmp(name,"mysql_ifaces")) return s_strdup(variables.mysql_ifaces); @@ -3964,19 +4008,64 @@ bool ProxySQL_Admin::set_variable(char *name, char *value, bool lock) { // this } } #ifndef NOJEM - if (!strcasecmp(name,"stats_system_memory")) { - int intv=atoi(value); - if (intv >= 0 && intv <= 600) { + if (!strcasecmp(name,"stats_system_memory")) { + int intv=atoi(value); + if (intv >= 0 && intv <= 600) { intv = round_intv_to_time_interval(name, intv); variables.stats_system_memory=intv; GloProxyStats->variables.stats_system_memory=intv; return true; } else { return false; + } } - } #endif - } + if (!strcasecmp(name, "stats_tsdb_enabled")) { + int intv = atoi(value); + if (intv == 0 || intv == 1) { + variables.stats_tsdb_enabled = intv; + GloProxyStats->variables.stats_tsdb_enabled = intv; + return true; + } + return false; + } + if (!strcasecmp(name, "stats_tsdb_sample_interval")) { + int intv = atoi(value); + if (intv >= 1 && intv <= 3600) { + variables.stats_tsdb_sample_interval = intv; + GloProxyStats->variables.stats_tsdb_sample_interval = intv; + return true; + } + return false; + } + if (!strcasecmp(name, "stats_tsdb_retention_days")) { + int intv = atoi(value); + if (intv >= 1 && intv <= 3650) { + 
variables.stats_tsdb_retention_days = intv; + GloProxyStats->variables.stats_tsdb_retention_days = intv; + return true; + } + return false; + } + if (!strcasecmp(name, "stats_tsdb_monitor_enabled")) { + int intv = atoi(value); + if (intv == 0 || intv == 1) { + variables.stats_tsdb_monitor_enabled = intv; + GloProxyStats->variables.stats_tsdb_monitor_enabled = intv; + return true; + } + return false; + } + if (!strcasecmp(name, "stats_tsdb_monitor_interval")) { + int intv = atoi(value); + if (intv >= 1 && intv <= 3600) { + variables.stats_tsdb_monitor_interval = intv; + GloProxyStats->variables.stats_tsdb_monitor_interval = intv; + return true; + } + return false; + } + } if (!strcasecmp(name,"mysql_ifaces")) { if (vallen) { bool update_creds=false; diff --git a/lib/ProxySQL_Statistics.cpp b/lib/ProxySQL_Statistics.cpp index 12c31e41c..12d859826 100644 --- a/lib/ProxySQL_Statistics.cpp +++ b/lib/ProxySQL_Statistics.cpp @@ -12,6 +12,53 @@ using json = nlohmann::json; #include #include +#include +#include +#include +#include +#include +#include + +namespace { +std::string escape_sql_string_literal(const std::string& value) { + std::string escaped; + escaped.reserve(value.size() + 8); + for (const char c : value) { + if (c == '\'') { + escaped += "''"; + } else { + escaped += c; + } + } + return escaped; +} + +bool valid_label_key(const std::string& key) { + if (key.empty()) { + return false; + } + for (const unsigned char c : key) { + if (!(std::isalnum(c) || c == '_' || c == '-' || c == ':')) { + return false; + } + } + return true; +} + +std::string format_prometheus_label_double(const double value) { + if (std::isnan(value)) { + return "nan"; + } + if (std::isinf(value)) { + return value > 0 ? 
"+Inf" : "-Inf"; + } + std::ostringstream oss; + oss.setf(std::ios::fmtflags(0), std::ios::floatfield); + oss.precision(17); + oss << value; + return oss.str(); +} +} // namespace #ifdef DEBUG #define DEB "_DEBUG" @@ -55,6 +102,15 @@ ProxySQL_Statistics::ProxySQL_Statistics() { next_timer_system_memory = 0; #endif next_timer_MySQL_Query_Cache = 0; + next_timer_tsdb_sampler = 0; + next_timer_tsdb_downsample = 0; + next_timer_tsdb_monitor = 0; + + variables.stats_tsdb_enabled = 0; + variables.stats_tsdb_sample_interval = 5; + variables.stats_tsdb_retention_days = 7; + variables.stats_tsdb_monitor_enabled = 0; + variables.stats_tsdb_monitor_interval = 10; } ProxySQL_Statistics::~ProxySQL_Statistics() { @@ -131,6 +187,10 @@ void ProxySQL_Statistics::init() { statsdb_disk->execute("CREATE INDEX IF NOT EXISTS idx_history_mysql_query_events_start_time ON history_mysql_query_events(start_time)"); statsdb_disk->execute("CREATE INDEX IF NOT EXISTS idx_history_mysql_query_events_query_digest ON history_mysql_query_events(query_digest)"); + statsdb_disk->execute("CREATE INDEX IF NOT EXISTS idx_tsdb_metrics_metric_time ON tsdb_metrics(metric_name, timestamp)"); + statsdb_disk->execute("CREATE INDEX IF NOT EXISTS idx_tsdb_metrics_hour_metric_bucket ON tsdb_metrics_hour(metric_name, bucket)"); + statsdb_disk->execute("CREATE INDEX IF NOT EXISTS idx_tsdb_backend_health_time ON tsdb_backend_health(timestamp)"); + statsdb_disk->execute("CREATE INDEX IF NOT EXISTS idx_tsdb_backend_health_host_time ON tsdb_backend_health(hostgroup, hostname, port, timestamp)"); } void ProxySQL_Statistics::disk_upgrade_mysql_connections() { @@ -156,6 +216,40 @@ void ProxySQL_Statistics::disk_upgrade_mysql_connections() { statsdb_disk->execute("ALTER TABLE mysql_connections_day ADD COLUMN GTID_consistent_queries INT NOT NULL DEFAULT 0"); proxy_warning("ONLINE UPGRADE of table mysql_connections_day completed\n"); } + + const char* tsdb_metrics_old = + "CREATE TABLE tsdb_metrics (timestamp INT NOT NULL, 
metric_name TEXT NOT NULL, labels TEXT, value REAL, PRIMARY KEY (timestamp, metric_name)) WITHOUT ROWID"; + rci = statsdb_disk->check_table_structure((char*)"tsdb_metrics", (char*)tsdb_metrics_old); + if (rci) { + proxy_warning("Detected legacy schema for tsdb_metrics\n"); + statsdb_disk->execute("BEGIN IMMEDIATE"); + statsdb_disk->execute("ALTER TABLE tsdb_metrics RENAME TO tsdb_metrics_old"); + statsdb_disk->execute(STATSDB_SQLITE_TABLE_TSDB_METRICS); + statsdb_disk->execute( + "INSERT OR IGNORE INTO tsdb_metrics(timestamp, metric_name, labels, value) " + "SELECT timestamp, metric_name, COALESCE(labels,'{}'), value FROM tsdb_metrics_old" + ); + statsdb_disk->execute("DROP TABLE tsdb_metrics_old"); + statsdb_disk->execute("COMMIT"); + proxy_warning("ONLINE UPGRADE of table tsdb_metrics completed\n"); + } + + const char* tsdb_metrics_hour_old = + "CREATE TABLE tsdb_metrics_hour (bucket INT NOT NULL, metric_name TEXT NOT NULL, labels TEXT, avg_value REAL, max_value REAL, min_value REAL, count INT, PRIMARY KEY (bucket, metric_name)) WITHOUT ROWID"; + rci = statsdb_disk->check_table_structure((char*)"tsdb_metrics_hour", (char*)tsdb_metrics_hour_old); + if (rci) { + proxy_warning("Detected legacy schema for tsdb_metrics_hour\n"); + statsdb_disk->execute("BEGIN IMMEDIATE"); + statsdb_disk->execute("ALTER TABLE tsdb_metrics_hour RENAME TO tsdb_metrics_hour_old"); + statsdb_disk->execute(STATSDB_SQLITE_TABLE_TSDB_METRICS_HOUR); + statsdb_disk->execute( + "INSERT OR IGNORE INTO tsdb_metrics_hour(bucket, metric_name, labels, avg_value, max_value, min_value, count) " + "SELECT bucket, metric_name, COALESCE(labels,'{}'), avg_value, max_value, min_value, count FROM tsdb_metrics_hour_old" + ); + statsdb_disk->execute("DROP TABLE tsdb_metrics_hour_old"); + statsdb_disk->execute("COMMIT"); + proxy_warning("ONLINE UPGRADE of table tsdb_metrics_hour completed\n"); + } } void ProxySQL_Statistics::print_version() { @@ -1217,7 +1311,7 @@ void 
ProxySQL_Statistics::insert_tsdb_metric(const std::string& metric_name, sqlite3 *mydb3 = statsdb_disk->get_db(); sqlite3_stmt *statement = NULL; - const char* query = "INSERT INTO tsdb_metrics VALUES (?1, ?2, ?3, ?4)"; + const char* query = "INSERT OR REPLACE INTO tsdb_metrics(timestamp, metric_name, labels, value) VALUES (?1, ?2, ?3, ?4)"; int rc = (*proxy_sqlite3_prepare_v2)(mydb3, query, -1, &statement, 0); if (rc != SQLITE_OK) { proxy_error("Failed to prepare statement: %s\n", sqlite3_errmsg(mydb3)); @@ -1227,9 +1321,12 @@ void ProxySQL_Statistics::insert_tsdb_metric(const std::string& metric_name, // Convert labels map to JSON string json j_labels(labels); std::string labels_str = j_labels.dump(); + if (labels_str.empty()) { + labels_str = "{}"; + } rc = (*proxy_sqlite3_bind_int64)(statement, 1, timestamp); - rc = (*proxy_sqlite3_bind_text)(statement, 2, metric_name.c_str(), -1, SQLITE_STATIC); + rc = (*proxy_sqlite3_bind_text)(statement, 2, metric_name.c_str(), -1, SQLITE_TRANSIENT); rc = (*proxy_sqlite3_bind_text)(statement, 3, labels_str.c_str(), -1, SQLITE_TRANSIENT); rc = (*proxy_sqlite3_bind_double)(statement, 4, value); @@ -1248,7 +1345,7 @@ void ProxySQL_Statistics::insert_backend_health(int hostgroup, sqlite3 *mydb3 = statsdb_disk->get_db(); sqlite3_stmt *statement = NULL; - const char* query = "INSERT INTO tsdb_backend_health VALUES (?1, ?2, ?3, ?4, ?5, ?6)"; + const char* query = "INSERT OR REPLACE INTO tsdb_backend_health(timestamp, hostgroup, hostname, port, probe_up, connect_ms) VALUES (?1, ?2, ?3, ?4, ?5, ?6)"; int rc = (*proxy_sqlite3_prepare_v2)(mydb3, query, -1, &statement, 0); if (rc != SQLITE_OK) { proxy_error("Failed to prepare statement: %s\n", sqlite3_errmsg(mydb3)); @@ -1257,7 +1354,7 @@ void ProxySQL_Statistics::insert_backend_health(int hostgroup, rc = (*proxy_sqlite3_bind_int64)(statement, 1, timestamp); rc = (*proxy_sqlite3_bind_int)(statement, 2, hostgroup); - rc = (*proxy_sqlite3_bind_text)(statement, 3, hostname.c_str(), -1, 
SQLITE_STATIC); + rc = (*proxy_sqlite3_bind_text)(statement, 3, hostname.c_str(), -1, SQLITE_TRANSIENT); rc = (*proxy_sqlite3_bind_int)(statement, 4, port); rc = (*proxy_sqlite3_bind_int)(statement, 5, probe_up ? 1 : 0); rc = (*proxy_sqlite3_bind_int)(statement, 6, connect_ms); @@ -1268,6 +1365,7 @@ void ProxySQL_Statistics::insert_backend_health(int hostgroup, // TSDB Downsampling void ProxySQL_Statistics::tsdb_downsample_metrics() { + if (!variables.stats_tsdb_enabled) return; if (!statsdb_disk) return; time_t ts = time(NULL); @@ -1308,11 +1406,12 @@ void ProxySQL_Statistics::tsdb_downsample_metrics() { statsdb_disk->execute(buf); } - // Retention: delete raw data older than 7 days + const int retention_days = std::max(1, variables.stats_tsdb_retention_days); + // Retention: delete raw data older than configured days char delete_buf[256]; snprintf(delete_buf, sizeof(delete_buf), "DELETE FROM tsdb_metrics WHERE timestamp < %ld", - ts - 86400 * 7); + ts - 86400 * retention_days); statsdb_disk->execute(delete_buf); // Retention: delete hourly data older than 1 year @@ -1320,6 +1419,12 @@ void ProxySQL_Statistics::tsdb_downsample_metrics() { "DELETE FROM tsdb_metrics_hour WHERE bucket < %ld", ts - 86400 * 365); statsdb_disk->execute(delete_buf); + + // Retention: delete backend probe data older than configured days + snprintf(delete_buf, sizeof(delete_buf), + "DELETE FROM tsdb_backend_health WHERE timestamp < %ld", + ts - 86400 * retention_days); + statsdb_disk->execute(delete_buf); } // TSDB Status @@ -1335,7 +1440,7 @@ ProxySQL_Statistics::tsdb_status_t ProxySQL_Statistics::get_tsdb_status() { // Count total series (unique metric_name + labels combinations) statsdb_disk->execute_statement( - "SELECT COUNT(DISTINCT metric_name || labels) FROM tsdb_metrics", + "SELECT COUNT(DISTINCT metric_name || CHAR(31) || labels) FROM tsdb_metrics", &error, &cols, &affected_rows, &resultset); if (resultset && resultset->rows_count > 0) { status.total_series = 
atol(resultset->rows[0]->fields[0]); @@ -1367,11 +1472,20 @@ ProxySQL_Statistics::tsdb_status_t ProxySQL_Statistics::get_tsdb_status() { delete resultset; } + long long page_count = statsdb_disk->return_one_int((char*)"PRAGMA page_count"); + long long page_size = statsdb_disk->return_one_int((char*)"PRAGMA page_size"); + if (page_count > 0 && page_size > 0) { + status.disk_size_bytes = page_count * page_size; + } + return status; } // TSDB Timer Checks bool ProxySQL_Statistics::tsdb_sampler_timetoget(unsigned long long curtime) { + if (!variables.stats_tsdb_enabled || variables.stats_tsdb_sample_interval <= 0) { + return false; + } if (curtime > next_timer_tsdb_sampler) { next_timer_tsdb_sampler = curtime + variables.stats_tsdb_sample_interval * 1000000; return true; @@ -1380,14 +1494,20 @@ bool ProxySQL_Statistics::tsdb_sampler_timetoget(unsigned long long curtime) { } bool ProxySQL_Statistics::tsdb_downsample_timetoget(unsigned long long curtime) { + if (!variables.stats_tsdb_enabled) { + return false; + } if (curtime > next_timer_tsdb_downsample) { - next_timer_tsdb_downsample = curtime + 3600 * 1000000; // Hourly + next_timer_tsdb_downsample = curtime + 3600ULL * 1000000ULL; // Hourly return true; } return false; } bool ProxySQL_Statistics::tsdb_monitor_timetoget(unsigned long long curtime) { + if (!variables.stats_tsdb_enabled || !variables.stats_tsdb_monitor_enabled || variables.stats_tsdb_monitor_interval <= 0) { + return false; + } if (curtime > next_timer_tsdb_monitor) { next_timer_tsdb_monitor = curtime + variables.stats_tsdb_monitor_interval * 1000000; return true; @@ -1404,86 +1524,90 @@ SQLite3_result* ProxySQL_Statistics::query_tsdb_metrics( const std::string& aggregation) { if (!statsdb_disk) return NULL; + if (to < from) { + std::swap(from, to); + } + const bool use_hourly = (to - from > 86400); + const std::string agg = aggregation.empty() ? 
"raw" : aggregation; std::string query; - // Choose table based on time range - if (to - from > 86400) { // > 1 day: use hourly table - query = "SELECT bucket, metric_name, labels, avg_value, max_value, min_value, count " - "FROM tsdb_metrics_hour WHERE metric_name = ?1 AND bucket BETWEEN ?2 AND ?3"; + if (use_hourly) { + std::string value_expr = "avg_value"; + if (agg == "max") { + value_expr = "max_value"; + } else if (agg == "min") { + value_expr = "min_value"; + } else if (agg == "count") { + value_expr = "count"; + } + query = + "SELECT bucket AS ts, metric_name, labels, " + value_expr + " AS value " + "FROM tsdb_metrics_hour " + "WHERE metric_name='" + escape_sql_string_literal(metric_name) + "' " + "AND bucket BETWEEN " + std::to_string(from) + " AND " + std::to_string(to); } else { - query = "SELECT timestamp, metric_name, labels, value, NULL, NULL, 1 " - "FROM tsdb_metrics WHERE metric_name = ?1 AND timestamp BETWEEN ?2 AND ?3"; + query = + "SELECT timestamp AS ts, metric_name, labels, value " + "FROM tsdb_metrics " + "WHERE metric_name='" + escape_sql_string_literal(metric_name) + "' " + "AND timestamp BETWEEN " + std::to_string(from) + " AND " + std::to_string(to); } - // Add label filters using JSON_EXTRACT - int param_idx = 4; for (const auto& kv : label_filters) { - query += " AND json_extract(labels, '$." + kv.first + "') = ?" + std::to_string(param_idx++); + if (!valid_label_key(kv.first)) { + proxy_error("Invalid TSDB label key: %s\n", kv.first.c_str()); + return NULL; + } + query += " AND json_extract(labels, '$.\"" + kv.first + "\"')='" + escape_sql_string_literal(kv.second) + "'"; } - query += " ORDER BY " + std::string(to - from > 86400 ? "bucket" : "timestamp"); + query += std::string(" ORDER BY ") + (use_hourly ? 
"bucket" : "timestamp"); - // Prepare and execute - sqlite3_stmt *statement = NULL; - sqlite3 *mydb3 = statsdb_disk->get_db(); - int rc = (*proxy_sqlite3_prepare_v2)(mydb3, query.c_str(), -1, &statement, 0); - if (rc != SQLITE_OK) { - proxy_error("Failed to prepare statement: %s\n", sqlite3_errmsg(mydb3)); + char* error = NULL; + int cols = 0; + int affected_rows = 0; + SQLite3_result* resultset = NULL; + statsdb_disk->execute_statement((char*)query.c_str(), &error, &cols, &affected_rows, &resultset); + if (error) { + proxy_error("query_tsdb_metrics failed: %s -- sql: %s\n", error, query.c_str()); + if (resultset) { + delete resultset; + } return NULL; } - - // Bind parameters - rc = (*proxy_sqlite3_bind_text)(statement, 1, metric_name.c_str(), -1, SQLITE_STATIC); - rc = (*proxy_sqlite3_bind_int64)(statement, 2, from); - rc = (*proxy_sqlite3_bind_int64)(statement, 3, to); - - param_idx = 4; - for (const auto& kv : label_filters) { - rc = (*proxy_sqlite3_bind_text)(statement, param_idx++, kv.second.c_str(), -1, SQLITE_TRANSIENT); - } - - // Execute statement - SAFE_SQLITE3_STEP2(statement); - (*proxy_sqlite3_finalize)(statement); - - // Return empty resultset - caller will query for actual data - return NULL; + return resultset; } // TSDB Backend Health Query SQLite3_result* ProxySQL_Statistics::get_backend_health_metrics(time_t from, time_t to, int hostgroup) { if (!statsdb_disk) return NULL; + if (to < from) { + std::swap(from, to); + } - std::string query = "SELECT timestamp, hostgroup, hostname, port, probe_up, connect_ms " - "FROM tsdb_backend_health WHERE timestamp BETWEEN ?1 AND ?2"; - + std::string query = + "SELECT timestamp, hostgroup, hostname, port, probe_up, connect_ms " + "FROM tsdb_backend_health " + "WHERE timestamp BETWEEN " + std::to_string(from) + " AND " + std::to_string(to); if (hostgroup >= 0) { - query += " AND hostgroup = ?3"; + query += " AND hostgroup = " + std::to_string(hostgroup); } - query += " ORDER BY timestamp"; - sqlite3_stmt 
*statement = NULL; - sqlite3 *mydb3 = statsdb_disk->get_db(); - int rc = (*proxy_sqlite3_prepare_v2)(mydb3, query.c_str(), -1, &statement, 0); - if (rc != SQLITE_OK) { - proxy_error("Failed to prepare statement: %s\n", sqlite3_errmsg(mydb3)); + char* error = NULL; + int cols = 0; + int affected_rows = 0; + SQLite3_result* resultset = NULL; + statsdb_disk->execute_statement((char*)query.c_str(), &error, &cols, &affected_rows, &resultset); + if (error) { + proxy_error("get_backend_health_metrics failed: %s -- sql: %s\n", error, query.c_str()); + if (resultset) { + delete resultset; + } return NULL; } - - rc = (*proxy_sqlite3_bind_int64)(statement, 1, from); - rc = (*proxy_sqlite3_bind_int64)(statement, 2, to); - if (hostgroup >= 0) { - rc = (*proxy_sqlite3_bind_int)(statement, 3, hostgroup); - } - - // Execute statement - SAFE_SQLITE3_STEP2(statement); - (*proxy_sqlite3_finalize)(statement); - - // Return empty resultset - caller will query for actual data - return NULL; + return resultset; } // TSDB Sampler Loop @@ -1500,19 +1624,50 @@ void ProxySQL_Statistics::tsdb_sampler_loop() { for (const auto& lp : metric.label) { labels[lp.name] = lp.value; } - double val = 0.0; - if (metric.counter.value > 0) { - val = metric.counter.value; - } else if (metric.gauge.value > 0) { - val = metric.gauge.value; + switch (family.type) { + case prometheus::MetricType::Counter: + insert_tsdb_metric(family.name, labels, metric.counter.value, now); + break; + case prometheus::MetricType::Gauge: + insert_tsdb_metric(family.name, labels, metric.gauge.value, now); + break; + case prometheus::MetricType::Summary: { + insert_tsdb_metric(family.name + "_count", labels, static_cast(metric.summary.sample_count), now); + insert_tsdb_metric(family.name + "_sum", labels, metric.summary.sample_sum, now); + for (const auto& q : metric.summary.quantile) { + std::map q_labels(labels); + q_labels["quantile"] = format_prometheus_label_double(q.quantile); + insert_tsdb_metric(family.name, q_labels, 
q.value, now); + } + break; + } + case prometheus::MetricType::Histogram: { + insert_tsdb_metric(family.name + "_count", labels, static_cast(metric.histogram.sample_count), now); + insert_tsdb_metric(family.name + "_sum", labels, metric.histogram.sample_sum, now); + for (const auto& b : metric.histogram.bucket) { + std::map b_labels(labels); + b_labels["le"] = format_prometheus_label_double(b.upper_bound); + insert_tsdb_metric( + family.name + "_bucket", + b_labels, + static_cast(b.cumulative_count), + now + ); + } + break; + } + case prometheus::MetricType::Info: + insert_tsdb_metric(family.name, labels, metric.info.value, now); + break; + case prometheus::MetricType::Untyped: + default: + insert_tsdb_metric(family.name, labels, metric.untyped.value, now); + break; } - insert_tsdb_metric(family.name, labels, val, now); } } } - // Downsample if needed - tsdb_downsample_metrics(); } // TSDB Monitor Loop @@ -1524,46 +1679,60 @@ void ProxySQL_Statistics::tsdb_monitor_loop() { char *err_msg = NULL; int cols = 0; - int rows = 0; + int affected_rows = 0; SQLite3_result *resultset = NULL; GloAdmin->admindb->execute_statement( "SELECT hostgroup_id, hostname, port FROM runtime_mysql_servers", - &err_msg, &cols, &rows, &resultset); + &err_msg, &cols, &affected_rows, &resultset); if (resultset) { time_t now = time(NULL); - for (int i = 0; i < rows; i++) { + for (int i = 0; i < static_cast(resultset->rows_count); i++) { int hg = atoi(resultset->rows[i]->fields[0]); const char* host = resultset->rows[i]->fields[1]; + const std::string host_s = (host ? 
host : ""); int port = atoi(resultset->rows[i]->fields[2]); - // TCP probe - int sock = socket(AF_INET, SOCK_STREAM, 0); bool probe_up = false; int connect_ms = -1; - if (sock >= 0) { - struct sockaddr_in addr; - memset(&addr, 0, sizeof(addr)); - addr.sin_family = AF_INET; - addr.sin_port = htons(port); - if (inet_pton(AF_INET, host, &addr.sin_addr) > 0) { - struct timeval tv; - tv.tv_sec = 1; - tv.tv_usec = 0; - setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)); - setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv)); - - auto start = std::chrono::steady_clock::now(); - int res = connect(sock, (struct sockaddr *)&addr, sizeof(addr)); - auto end = std::chrono::steady_clock::now(); - connect_ms = std::chrono::duration_cast(end - start).count(); - probe_up = (res == 0); + if (!host_s.empty()) { + struct addrinfo hints; + memset(&hints, 0, sizeof(hints)); + hints.ai_socktype = SOCK_STREAM; + hints.ai_family = AF_UNSPEC; + + struct addrinfo* res = NULL; + char port_str[16]; + snprintf(port_str, sizeof(port_str), "%d", port); + + if (getaddrinfo(host_s.c_str(), port_str, &hints, &res) == 0 && res) { + for (struct addrinfo* ai = res; ai != NULL; ai = ai->ai_next) { + int sock = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol); + if (sock < 0) { + continue; + } + struct timeval tv; + tv.tv_sec = 1; + tv.tv_usec = 0; + setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)); + setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof(tv)); + + auto start = std::chrono::steady_clock::now(); + int cres = connect(sock, ai->ai_addr, ai->ai_addrlen); + auto end = std::chrono::steady_clock::now(); + connect_ms = std::chrono::duration_cast(end - start).count(); + close(sock); + if (cres == 0) { + probe_up = true; + break; + } + } + freeaddrinfo(res); } - close(sock); } - insert_backend_health(hg, host, port, probe_up, connect_ms, now); + insert_backend_health(hg, host_s, port, probe_up, connect_ms, now); } delete resultset; } diff --git 
a/test/tap/tests/test_tsdb_admin_variables-t.cpp b/test/tap/tests/test_tsdb_admin_variables-t.cpp new file mode 100644 index 000000000..e0ef51ea2 --- /dev/null +++ b/test/tap/tests/test_tsdb_admin_variables-t.cpp @@ -0,0 +1,123 @@ +#include +#include +#include + +#include "mysql.h" + +#include "tap.h" +#include "command_line.h" +#include "utils.h" + +using std::map; +using std::string; + +static bool fetch_single_string(MYSQL* mysql, const string& query, string& out) { + if (mysql_query(mysql, query.c_str())) { + return false; + } + MYSQL_RES* res = mysql_store_result(mysql); + if (!res) { + return false; + } + MYSQL_ROW row = mysql_fetch_row(res); + if (!row || !row[0]) { + mysql_free_result(res); + return false; + } + out = row[0]; + mysql_free_result(res); + return true; +} + +int main() { + CommandLine cl; + if (cl.getEnv()) { + diag("Failed to get the required environmental variables."); + return EXIT_FAILURE; + } + + plan(12); + + MYSQL* admin = mysql_init(NULL); + if (!admin) { + return EXIT_FAILURE; + } + if (!mysql_real_connect(admin, cl.host, cl.admin_username, cl.admin_password, NULL, cl.admin_port, NULL, 0)) { + mysql_close(admin); + return EXIT_FAILURE; + } + + int rc = mysql_query(admin, "LOAD TSDB VARIABLES TO RUNTIME"); + ok(rc != 0, "`LOAD TSDB VARIABLES TO RUNTIME` is rejected"); + + rc = mysql_query(admin, "SAVE TSDB VARIABLES TO DISK"); + ok(rc != 0, "`SAVE TSDB VARIABLES TO DISK` is rejected"); + + MYSQL_QUERY_T(admin, "SET admin-stats_tsdb_enabled='1'"); + MYSQL_QUERY_T(admin, "SET admin-stats_tsdb_sample_interval='11'"); + MYSQL_QUERY_T(admin, "SET admin-stats_tsdb_retention_days='30'"); + MYSQL_QUERY_T(admin, "SET admin-stats_tsdb_monitor_enabled='1'"); + MYSQL_QUERY_T(admin, "SET admin-stats_tsdb_monitor_interval='13'"); + MYSQL_QUERY_T(admin, "LOAD ADMIN VARIABLES TO RUNTIME"); + + string count; + bool count_ok = fetch_single_string( + admin, + "SELECT COUNT(*) FROM runtime_global_variables WHERE variable_name LIKE 'admin-stats_tsdb_%'", + 
count + ); + ok(count_ok, "Read runtime TSDB variable count from runtime_global_variables"); + ok(count == "5", "Exactly five admin-stats_tsdb runtime variables are present"); + + const map expected_runtime_values{ + {"admin-stats_tsdb_enabled", "1"}, + {"admin-stats_tsdb_sample_interval", "11"}, + {"admin-stats_tsdb_retention_days", "30"}, + {"admin-stats_tsdb_monitor_enabled", "1"}, + {"admin-stats_tsdb_monitor_interval", "13"}, + }; + + for (const auto& kv : expected_runtime_values) { + string value; + bool ok_fetch = fetch_single_string( + admin, + "SELECT variable_value FROM runtime_global_variables WHERE variable_name='" + kv.first + "'", + value + ); + ok(ok_fetch && value == kv.second, "Runtime value matches for %s", kv.first.c_str()); + } + + MYSQL_QUERY_T(admin, "SAVE ADMIN VARIABLES TO DISK"); + string disk_enabled; + bool disk_ok = fetch_single_string( + admin, + "SELECT variable_value FROM global_variables WHERE variable_name='admin-stats_tsdb_enabled'", + disk_enabled + ); + ok(disk_ok && disk_enabled == "1", "TSDB admin variable is persisted to disk via SAVE ADMIN VARIABLES"); + + string metrics_schema; + bool schema_metrics_ok = fetch_single_string( + admin, + "SELECT sql FROM statsdb_disk.sqlite_master WHERE type='table' AND name='tsdb_metrics'", + metrics_schema + ); + ok( + schema_metrics_ok && metrics_schema.find("PRIMARY KEY (timestamp, metric_name, labels)") != string::npos, + "tsdb_metrics schema uses labels in primary key" + ); + + string metrics_hour_schema; + bool schema_hour_ok = fetch_single_string( + admin, + "SELECT sql FROM statsdb_disk.sqlite_master WHERE type='table' AND name='tsdb_metrics_hour'", + metrics_hour_schema + ); + ok( + schema_hour_ok && metrics_hour_schema.find("PRIMARY KEY (bucket, metric_name, labels)") != string::npos, + "tsdb_metrics_hour schema uses labels in primary key" + ); + + mysql_close(admin); + return exit_status(); +}