mirror of https://github.com/sysown/proxysql
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
407 lines
18 KiB
407 lines
18 KiB
/**
|
|
* @file pgsql-test_dns_cache-t.cpp
|
|
* @brief Tests the PgSQL-side DNS cache added for issue #5768.
|
|
*
|
|
* Covers:
|
|
* - Cache is on by default and a resolvable hostname populates it
|
|
* (PgSQL_Monitor_dns_cache_record_updated increments).
|
|
* - IP literals bypass the cache (record_updated stays flat).
|
|
* - Client connect through ProxySQL goes through the cache
|
|
* (PgSQL_Monitor_dns_cache_queried + _lookup_success increment).
|
|
* - Hostnames with leading/trailing whitespace are trimmed before
|
|
* resolution.
|
|
* - An unresolvable hostname is queried but never produces a record.
|
|
* - Removing a pgsql_servers row drops the corresponding cache record.
|
|
* - Disabling the cache (refresh_interval=0) flatlines all counters.
|
|
* - PgSQL cache state is independent of MySQL cache state: tweaking
|
|
* pgsql-monitor_local_dns_* and inserting pgsql hostnames must not
|
|
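 *     move the MySQL_Monitor_dns_cache_* counters. (Only this direction is
 *     asserted; MySQL activity leaving the PgSQL counters untouched is not
 *     exercised by this test.)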
|
|
*
|
|
* Reads counters from stats_pgsql_global / stats_mysql_global rather
|
|
* than Prometheus because the PgSQL DNS counters are only exposed
|
|
* through stats_pgsql_global today.
|
|
*/
|
|
|
|
#include <chrono>
|
|
#include <cstdio>
|
|
#include <cstdlib>
|
|
#include <cstring>
|
|
#include <memory>
|
|
#include <sstream>
|
|
#include <string>
|
|
#include <thread>
|
|
#include <vector>
|
|
|
|
#include "libpq-fe.h"
|
|
|
|
#include "command_line.h"
|
|
#include "tap.h"
|
|
|
|
// Shared test configuration. main() populates it via cl.getEnv(); the connect
// helpers below read the admin/backend host, port and credential fields from it.
CommandLine cl;
|
|
|
|
// Owning libpq connection handle: PQfinish is the deleter, so a non-null
// connection is closed when the handle goes out of scope.
using PGConnPtr = std::unique_ptr<PGconn, decltype(&PQfinish)>;
|
|
|
|
// Open a libpq session to the admin endpoint described by `cl`
// (pgsql_admin_host / pgsql_admin_port, admin credentials, sslmode=disable).
// Returns an owning handle when the connection status is CONNECTION_OK;
// otherwise reports the libpq error through diag() and returns an empty handle.
static PGConnPtr admin_connect() {
	std::ostringstream conninfo;
	conninfo << "host=" << cl.pgsql_admin_host;
	conninfo << " port=" << cl.pgsql_admin_port;
	conninfo << " user=" << cl.admin_username;
	conninfo << " password=" << cl.admin_password;
	conninfo << " sslmode=disable";

	// Take ownership immediately so the failure path cannot leak the PGconn.
	PGConnPtr conn(PQconnectdb(conninfo.str().c_str()), &PQfinish);
	if (PQstatus(conn.get()) == CONNECTION_OK) {
		return conn;
	}

	diag("ADMIN connect failed: %s", PQerrorMessage(conn.get()));
	// `conn` is finished by its destructor on the way out; callers get a null handle.
	return PGConnPtr(nullptr, &PQfinish);
}
|
|
|
|
// Open a libpq session to cl.pgsql_host:cl.pgsql_port with the regular
// (non-admin) test credentials and sslmode=disable.
//
// The handle is returned regardless of the resulting connection status: the
// test deliberately drives connection attempts that may fail (bad host, bad
// port), so a failed connect is not treated as a test error here. Callers
// must inspect PQstatus() themselves.
static PGConnPtr backend_connect() {
	std::ostringstream conninfo;
	conninfo << "host=" << cl.pgsql_host;
	conninfo << " port=" << cl.pgsql_port;
	conninfo << " user=" << cl.pgsql_username;
	conninfo << " password=" << cl.pgsql_password;
	conninfo << " sslmode=disable";

	const std::string dsn = conninfo.str();
	return PGConnPtr(PQconnectdb(dsn.c_str()), &PQfinish);
}
|
|
|
|
// Run a one-off statement on the admin connection. Returns true on success.
|
|
// Statements where we don't care about the row count use this.
|
|
static bool admin_exec(PGConnPtr& a, const std::string& sql) {
|
|
PGresult* r = PQexec(a.get(), sql.c_str());
|
|
const ExecStatusType st = PQresultStatus(r);
|
|
bool ok = (st == PGRES_COMMAND_OK || st == PGRES_TUPLES_OK);
|
|
if (!ok) {
|
|
diag("admin_exec('%s') failed: %s", sql.c_str(), PQerrorMessage(a.get()));
|
|
}
|
|
PQclear(r);
|
|
return ok;
|
|
}
|
|
|
|
// Read a single counter from stats_pgsql_global (or stats_mysql_global).
|
|
// Returns -1 on failure.
|
|
static long admin_counter(PGConnPtr& a, const char* table, const char* name) {
|
|
std::stringstream q;
|
|
q << "SELECT Variable_Value FROM " << table
|
|
<< " WHERE Variable_Name='" << name << "'";
|
|
PGresult* r = PQexec(a.get(), q.str().c_str());
|
|
if (PQresultStatus(r) != PGRES_TUPLES_OK || PQntuples(r) == 0) {
|
|
PQclear(r);
|
|
return -1;
|
|
}
|
|
long v = atol(PQgetvalue(r, 0, 0));
|
|
PQclear(r);
|
|
return v;
|
|
}
|
|
|
|
// Snapshot of the three DNS cache counters this test tracks.
// Every field starts at -1, the same value admin_counter() uses to signal a
// failed read, so an unfilled snapshot is distinguishable from real data.
struct DnsCounters {
	long queried = -1;         // PgSQL_Monitor_dns_cache_queried
	long lookup_success = -1;  // PgSQL_Monitor_dns_cache_lookup_success
	long record_updated = -1;  // PgSQL_Monitor_dns_cache_record_updated
};
|
|
|
|
// Capture the current PgSQL DNS cache counters from stats_pgsql_global.
// Each field is read with its own admin_counter() call, so a field is -1 if
// that particular read failed.
static DnsCounters read_pg_counters(PGConnPtr& a) {
	const auto fetch = [&a](const char* counter_name) {
		return admin_counter(a, "stats_pgsql_global", counter_name);
	};

	DnsCounters snapshot;
	snapshot.queried        = fetch("PgSQL_Monitor_dns_cache_queried");
	snapshot.lookup_success = fetch("PgSQL_Monitor_dns_cache_lookup_success");
	snapshot.record_updated = fetch("PgSQL_Monitor_dns_cache_record_updated");
	return snapshot;
}
|
|
|
|
// Force a `SELECT 1` round-trip through the proxy a few times so the worker
|
|
// thread opens a backend connection and the DNS cache lookup path runs.
|
|
// Returns the number of successful attempts; we don't require all of them
|
|
// because a hostname pointing at a dummy port is expected to fail at TCP
|
|
// connect — that's fine, the DNS lookup happened first.
|
|
static int hammer_proxy(int n) {
|
|
int ok_count = 0;
|
|
for (int i = 0; i < n; i++) {
|
|
PGConnPtr c = backend_connect();
|
|
if (PQstatus(c.get()) == CONNECTION_OK) {
|
|
PGresult* r = PQexec(c.get(), "SELECT 1");
|
|
if (PQresultStatus(r) == PGRES_TUPLES_OK) ok_count++;
|
|
PQclear(r);
|
|
}
|
|
// Brief pacing so we don't outpace the resolver loop.
|
|
std::this_thread::sleep_for(std::chrono::milliseconds(50));
|
|
}
|
|
return ok_count;
|
|
}
|
|
|
|
// Block the calling thread for `s` whole seconds.
static void sleep_seconds(int s) {
	const std::chrono::seconds pause(s);
	std::this_thread::sleep_for(pause);
}
|
|
|
|
// Wait up to deadline_secs for the resolver loop to populate the cache; poll
|
|
// every poll_ms. Returns true if record_updated grew past `baseline`.
|
|
static bool wait_for_record_growth(PGConnPtr& a, long baseline, int deadline_secs) {
|
|
const auto deadline = std::chrono::steady_clock::now() + std::chrono::seconds(deadline_secs);
|
|
while (std::chrono::steady_clock::now() < deadline) {
|
|
const long now_val = admin_counter(a, "stats_pgsql_global", "PgSQL_Monitor_dns_cache_record_updated");
|
|
if (now_val > baseline) return true;
|
|
std::this_thread::sleep_for(std::chrono::milliseconds(250));
|
|
}
|
|
return false;
|
|
}
|
|
|
|
int main(int /*argc*/, char** /*argv*/) {
|
|
if (cl.getEnv()) {
|
|
diag("Failed to get the required environmental variables.");
|
|
return -1;
|
|
}
|
|
|
|
plan(17);
|
|
|
|
PGConnPtr admin = admin_connect();
|
|
if (!admin) {
|
|
BAIL_OUT("Could not connect to ProxySQL Admin via libpq");
|
|
return exit_status();
|
|
}
|
|
|
|
// Make the cache tunable and fast, so test cycles complete in seconds.
|
|
// 500 ms refresh interval, 5 s TTL. Save the originals so we restore
|
|
// them on the way out.
|
|
const long orig_refresh = admin_counter(admin, "global_variables", "pgsql-monitor_local_dns_cache_refresh_interval");
|
|
const long orig_ttl = admin_counter(admin, "global_variables", "pgsql-monitor_local_dns_cache_ttl");
|
|
const long orig_qsize = admin_counter(admin, "global_variables", "pgsql-monitor_local_dns_resolver_queue_maxsize");
|
|
|
|
if (!admin_exec(admin, "SET pgsql-monitor_local_dns_cache_refresh_interval=500")) {
|
|
BAIL_OUT("Could not set pgsql-monitor_local_dns_cache_refresh_interval");
|
|
return exit_status();
|
|
}
|
|
admin_exec(admin, "SET pgsql-monitor_local_dns_cache_ttl=5000");
|
|
admin_exec(admin, "LOAD PGSQL VARIABLES TO RUNTIME");
|
|
|
|
// Confirm the new values actually landed in runtime (regression for the
|
|
// "names commented out in pgsql_thread_variables_names" bug fixed during
|
|
// initial bring-up).
|
|
const long rt_refresh = admin_counter(admin, "runtime_global_variables", "pgsql-monitor_local_dns_cache_refresh_interval");
|
|
const long rt_ttl = admin_counter(admin, "runtime_global_variables", "pgsql-monitor_local_dns_cache_ttl");
|
|
ok(rt_refresh == 500, "pgsql-monitor_local_dns_cache_refresh_interval propagated to runtime (got %ld, expected 500)", rt_refresh);
|
|
ok(rt_ttl == 5000, "pgsql-monitor_local_dns_cache_ttl propagated to runtime (got %ld, expected 5000)", rt_ttl);
|
|
|
|
// Wipe the pgsql_servers table down to a known state. Tests below add
|
|
// servers in a high hostgroup that real test infra doesn't use.
|
|
admin_exec(admin, "DELETE FROM pgsql_servers WHERE hostgroup_id=999");
|
|
admin_exec(admin, "LOAD PGSQL SERVERS TO RUNTIME");
|
|
// Give the resolver loop time to drop any orphaned bookkeeping entry.
|
|
sleep_seconds(2);
|
|
|
|
// =====================================================================
|
|
// Step 1: IP literal must NOT cause DNS work
|
|
// =====================================================================
|
|
diag("---- Step 1: IP-literal server should not populate the cache");
|
|
DnsCounters before = read_pg_counters(admin);
|
|
admin_exec(admin,
|
|
"INSERT INTO pgsql_servers (hostgroup_id,hostname,port,max_connections,comment) "
|
|
"VALUES (999,'0.0.0.0',7861,10,'pgsql-dns-test ip-literal')");
|
|
admin_exec(admin, "LOAD PGSQL SERVERS TO RUNTIME");
|
|
sleep_seconds(2);
|
|
DnsCounters after = read_pg_counters(admin);
|
|
ok(after.record_updated == before.record_updated,
|
|
"IP literal does not touch cache (record_updated %ld -> %ld)",
|
|
before.record_updated, after.record_updated);
|
|
|
|
// =====================================================================
|
|
// Step 2: Resolvable hostname populates record_updated
|
|
// =====================================================================
|
|
// We use example.com — IANA-reserved and stable. Public DNS hostnames
|
|
// are also what the upstream MySQL test uses. If the test runner has
|
|
// no outbound DNS, this step (and steps 3 and 4) will be skipped via
|
|
// the wait_for_record_growth helper, which fails the ok(). CI runners
|
|
// have network access; the failure surface from a missing resolver is
|
|
// noisy on purpose so we notice.
|
|
diag("---- Step 2: Resolvable hostname populates the cache");
|
|
before = read_pg_counters(admin);
|
|
admin_exec(admin,
|
|
"INSERT INTO pgsql_servers (hostgroup_id,hostname,port,max_connections,comment) "
|
|
"VALUES (999,'example.com',7861,10,'pgsql-dns-test resolvable')");
|
|
admin_exec(admin, "LOAD PGSQL SERVERS TO RUNTIME");
|
|
// trigger_dns_cache_update fires on LOAD so refresh shouldn't take a
|
|
// full interval, but give it a generous window.
|
|
bool grew = wait_for_record_growth(admin, before.record_updated, 10);
|
|
ok(grew, "resolver populated cache for example.com (record_updated %ld -> %ld)",
|
|
before.record_updated, admin_counter(admin, "stats_pgsql_global", "PgSQL_Monitor_dns_cache_record_updated"));
|
|
|
|
// =====================================================================
|
|
// Step 3: Client connect through proxy hits the cache
|
|
// =====================================================================
|
|
diag("---- Step 3: Client connect attempt drives lookup counters");
|
|
before = read_pg_counters(admin);
|
|
// Drive several attempts. These will fail at the TCP layer (port 7861
|
|
// is not listening) but the DNS lookup path runs first and that's what
|
|
// we're counting.
|
|
hammer_proxy(3);
|
|
sleep_seconds(1);
|
|
after = read_pg_counters(admin);
|
|
ok(after.queried > before.queried,
|
|
"dns_cache_queried bumped after client connect attempts (%ld -> %ld)",
|
|
before.queried, after.queried);
|
|
ok(after.lookup_success > before.lookup_success,
|
|
"dns_cache_lookup_success bumped (cache hit for example.com) (%ld -> %ld)",
|
|
before.lookup_success, after.lookup_success);
|
|
|
|
// =====================================================================
|
|
// Step 4: Whitespace around hostname is trimmed before resolution
|
|
// =====================================================================
|
|
diag("---- Step 4: Hostname whitespace is trimmed");
|
|
before = read_pg_counters(admin);
|
|
admin_exec(admin,
|
|
"INSERT INTO pgsql_servers (hostgroup_id,hostname,port,max_connections,comment) "
|
|
"VALUES (999,' example.org ',7861,10,'pgsql-dns-test whitespace')");
|
|
admin_exec(admin, "LOAD PGSQL SERVERS TO RUNTIME");
|
|
grew = wait_for_record_growth(admin, before.record_updated, 10);
|
|
ok(grew, "trimmed hostname ' example.org ' was resolved (record_updated %ld -> %ld)",
|
|
before.record_updated, admin_counter(admin, "stats_pgsql_global", "PgSQL_Monitor_dns_cache_record_updated"));
|
|
|
|
// =====================================================================
|
|
// Step 5: Unresolvable hostname — lookup is queried but produces nothing
|
|
// =====================================================================
|
|
diag("---- Step 5: Unresolvable hostname");
|
|
// Drop everything except an unresolvable host so the cache has a clean
|
|
// shape; then drive a lookup attempt.
|
|
admin_exec(admin, "DELETE FROM pgsql_servers WHERE hostgroup_id=999");
|
|
admin_exec(admin, "LOAD PGSQL SERVERS TO RUNTIME");
|
|
sleep_seconds(2);
|
|
before = read_pg_counters(admin);
|
|
admin_exec(admin,
|
|
"INSERT INTO pgsql_servers (hostgroup_id,hostname,port,max_connections,comment) "
|
|
"VALUES (999,'this-host-should-not-resolve.invalid',7861,10,'pgsql-dns-test nxdomain')");
|
|
admin_exec(admin, "LOAD PGSQL SERVERS TO RUNTIME");
|
|
sleep_seconds(3);
|
|
after = read_pg_counters(admin);
|
|
ok(after.record_updated == before.record_updated,
|
|
"unresolvable host did not produce a record (record_updated %ld -> %ld)",
|
|
before.record_updated, after.record_updated);
|
|
|
|
// Now drive a client connect and confirm dns_cache_queried bumps but
|
|
// lookup_success stays flat (cache miss, falls back to hostname).
|
|
before = after;
|
|
hammer_proxy(3);
|
|
sleep_seconds(1);
|
|
after = read_pg_counters(admin);
|
|
ok(after.queried > before.queried,
|
|
"dns_cache_queried bumped on unresolved hostname attempts (%ld -> %ld)",
|
|
before.queried, after.queried);
|
|
ok(after.lookup_success == before.lookup_success,
|
|
"dns_cache_lookup_success stayed flat (cache miss) (%ld -> %ld)",
|
|
before.lookup_success, after.lookup_success);
|
|
|
|
// =====================================================================
|
|
// Step 6: Removing servers drops orphaned cache records
|
|
// =====================================================================
|
|
diag("---- Step 6: Removing pgsql_servers clears orphaned cache records");
|
|
// Add a resolvable host, wait for the cache to populate, then drop and
|
|
// confirm record_updated grew (the bookkeeper signals a remove via the
|
|
// same counter).
|
|
admin_exec(admin, "DELETE FROM pgsql_servers WHERE hostgroup_id=999");
|
|
admin_exec(admin, "LOAD PGSQL SERVERS TO RUNTIME");
|
|
sleep_seconds(2);
|
|
before = read_pg_counters(admin);
|
|
admin_exec(admin,
|
|
"INSERT INTO pgsql_servers (hostgroup_id,hostname,port,max_connections,comment) "
|
|
"VALUES (999,'example.net',7861,10,'pgsql-dns-test orphan-cleanup')");
|
|
admin_exec(admin, "LOAD PGSQL SERVERS TO RUNTIME");
|
|
bool added = wait_for_record_growth(admin, before.record_updated, 10);
|
|
long after_add = admin_counter(admin, "stats_pgsql_global", "PgSQL_Monitor_dns_cache_record_updated");
|
|
ok(added, "added example.net (record_updated %ld -> %ld)", before.record_updated, after_add);
|
|
|
|
// Now drop the row and wait for the orphan-cleanup pass.
|
|
admin_exec(admin, "DELETE FROM pgsql_servers WHERE hostgroup_id=999");
|
|
admin_exec(admin, "LOAD PGSQL SERVERS TO RUNTIME");
|
|
bool removed = wait_for_record_growth(admin, after_add, 10);
|
|
ok(removed,
|
|
"orphan removal bumped record_updated (was %ld, now %ld)",
|
|
after_add, admin_counter(admin, "stats_pgsql_global", "PgSQL_Monitor_dns_cache_record_updated"));
|
|
|
|
// =====================================================================
|
|
// Step 7: Disabled cache (refresh_interval=0) flatlines counters
|
|
// =====================================================================
|
|
diag("---- Step 7: Cache disabled by refresh_interval=0");
|
|
admin_exec(admin, "DELETE FROM pgsql_servers WHERE hostgroup_id=999");
|
|
admin_exec(admin, "LOAD PGSQL SERVERS TO RUNTIME");
|
|
sleep_seconds(1);
|
|
admin_exec(admin, "SET pgsql-monitor_local_dns_cache_refresh_interval=0");
|
|
admin_exec(admin, "LOAD PGSQL VARIABLES TO RUNTIME");
|
|
// Use a resolvable hostname so the assertion exercises the actual cache
|
|
// path — if disabling the cache regressed (resolver still ran, or the
|
|
// connect path still called into the cache), record_updated and queried
|
|
// would move and the test would fail. An IP literal would bypass DNS
|
|
// regardless of refresh_interval and miss that regression.
|
|
admin_exec(admin,
|
|
"INSERT INTO pgsql_servers (hostgroup_id,hostname,port,max_connections,comment) "
|
|
"VALUES (999,'example.com',7861,10,'pgsql-dns-test cache-off')");
|
|
admin_exec(admin, "LOAD PGSQL SERVERS TO RUNTIME");
|
|
sleep_seconds(3);
|
|
before = read_pg_counters(admin);
|
|
hammer_proxy(3);
|
|
sleep_seconds(2);
|
|
after = read_pg_counters(admin);
|
|
ok(after.queried == before.queried,
|
|
"cache off: dns_cache_queried unchanged (%ld -> %ld)",
|
|
before.queried, after.queried);
|
|
ok(after.lookup_success == before.lookup_success,
|
|
"cache off: dns_cache_lookup_success unchanged (%ld -> %ld)",
|
|
before.lookup_success, after.lookup_success);
|
|
ok(after.record_updated == before.record_updated,
|
|
"cache off: dns_cache_record_updated unchanged (%ld -> %ld)",
|
|
before.record_updated, after.record_updated);
|
|
|
|
// =====================================================================
|
|
// Step 8: PgSQL cache is independent of MySQL cache
|
|
// =====================================================================
|
|
diag("---- Step 8: PgSQL cache state independent of MySQL cache");
|
|
// Re-enable the cache so the next inserts populate it, then make sure
|
|
// the MySQL counters don't budge in response to pgsql-side activity.
|
|
admin_exec(admin, "SET pgsql-monitor_local_dns_cache_refresh_interval=500");
|
|
admin_exec(admin, "LOAD PGSQL VARIABLES TO RUNTIME");
|
|
const long my_before_q = admin_counter(admin, "stats_mysql_global", "MySQL_Monitor_dns_cache_queried");
|
|
const long my_before_u = admin_counter(admin, "stats_mysql_global", "MySQL_Monitor_dns_cache_record_updated");
|
|
admin_exec(admin, "DELETE FROM pgsql_servers WHERE hostgroup_id=999");
|
|
admin_exec(admin,
|
|
"INSERT INTO pgsql_servers (hostgroup_id,hostname,port,max_connections,comment) "
|
|
"VALUES (999,'example.com',7861,10,'pgsql-dns-test independence')");
|
|
admin_exec(admin, "LOAD PGSQL SERVERS TO RUNTIME");
|
|
sleep_seconds(3);
|
|
hammer_proxy(2);
|
|
sleep_seconds(1);
|
|
const long my_after_q = admin_counter(admin, "stats_mysql_global", "MySQL_Monitor_dns_cache_queried");
|
|
const long my_after_u = admin_counter(admin, "stats_mysql_global", "MySQL_Monitor_dns_cache_record_updated");
|
|
ok(my_after_q == my_before_q,
|
|
"pgsql activity did not bump MySQL_Monitor_dns_cache_queried (%ld -> %ld)",
|
|
my_before_q, my_after_q);
|
|
ok(my_after_u == my_before_u,
|
|
"pgsql activity did not bump MySQL_Monitor_dns_cache_record_updated (%ld -> %ld)",
|
|
my_before_u, my_after_u);
|
|
|
|
// =====================================================================
|
|
// Cleanup
|
|
// =====================================================================
|
|
admin_exec(admin, "DELETE FROM pgsql_servers WHERE hostgroup_id=999");
|
|
admin_exec(admin, "LOAD PGSQL SERVERS TO RUNTIME");
|
|
{
|
|
std::stringstream q;
|
|
q << "SET pgsql-monitor_local_dns_cache_refresh_interval=" << (orig_refresh > 0 ? orig_refresh : 60000);
|
|
admin_exec(admin, q.str());
|
|
}
|
|
{
|
|
std::stringstream q;
|
|
q << "SET pgsql-monitor_local_dns_cache_ttl=" << (orig_ttl > 0 ? orig_ttl : 300000);
|
|
admin_exec(admin, q.str());
|
|
}
|
|
{
|
|
std::stringstream q;
|
|
q << "SET pgsql-monitor_local_dns_resolver_queue_maxsize=" << (orig_qsize > 0 ? orig_qsize : 128);
|
|
admin_exec(admin, q.str());
|
|
}
|
|
admin_exec(admin, "LOAD PGSQL VARIABLES TO RUNTIME");
|
|
|
|
return exit_status();
|
|
}
|