Added mysql-monitor_replication_lag_group_by_host

If variable mysql-monitor_replication_lag_group_by_host=false: (default)
Monitor will perform 1 replication lag check per server per hostgroup.

If variable mysql-monitor_replication_lag_group_by_host=true:
Monitor will perform 1 replication lag check per server.

This variable need to be set only in setups in which the same server is
configured in many hostgroups, thus reducing the number of checks
pull/3867/head
René Cannaò 4 years ago
parent 33cf9855d9
commit 4bfe3e5878

@ -578,6 +578,7 @@ class MySQL_HostGroups_Manager {
void push_MyConn_to_pool_array(MySQL_Connection **, unsigned int);
void destroy_MyConn_from_pool(MySQL_Connection *, bool _lock=true);
void replication_lag_action_inner(MyHGC *, char*, unsigned int, int);
void replication_lag_action(int, char*, unsigned int, int);
void read_only_action(char *hostname, int port, int read_only);
unsigned int get_servers_table_version();

@ -348,6 +348,7 @@ struct p_th_gauge {
mysql_monitor_read_only_interval,
mysql_monitor_read_only_timeout,
mysql_monitor_writer_is_also_reader,
mysql_monitor_replication_lag_group_by_host,
mysql_monitor_replication_lag_interval,
mysql_monitor_replication_lag_timeout,
mysql_monitor_history,
@ -434,6 +435,7 @@ class MySQL_Threads_Handler
//! ProxySQL session wait timeout. Unit: 'ms'.
bool monitor_wait_timeout;
bool monitor_writer_is_also_reader;
bool monitor_replication_lag_group_by_host;
//! How frequently a replication lag check is performed. Unit: 'ms'.
int monitor_replication_lag_interval;
//! Read only check timeout. Unit: 'ms'.

@ -875,6 +875,7 @@ __thread int mysql_thread___monitor_read_only_timeout;
__thread int mysql_thread___monitor_read_only_max_timeout_count;
__thread bool mysql_thread___monitor_wait_timeout;
__thread bool mysql_thread___monitor_writer_is_also_reader;
__thread int mysql_thread___monitor_replication_lag_group_by_host;
__thread int mysql_thread___monitor_replication_lag_interval;
__thread int mysql_thread___monitor_replication_lag_timeout;
__thread int mysql_thread___monitor_replication_lag_count;
@ -1035,6 +1036,7 @@ extern __thread int mysql_thread___monitor_read_only_timeout;
extern __thread int mysql_thread___monitor_read_only_max_timeout_count;
extern __thread bool mysql_thread___monitor_wait_timeout;
extern __thread bool mysql_thread___monitor_writer_is_also_reader;
extern __thread bool mysql_thread___monitor_replication_lag_group_by_host;
extern __thread int mysql_thread___monitor_replication_lag_interval;
extern __thread int mysql_thread___monitor_replication_lag_timeout;
extern __thread int mysql_thread___monitor_replication_lag_count;

@ -3439,58 +3439,69 @@ void MySQL_HostGroups_Manager::add(MySrvC *mysrvc, unsigned int _hid) {
myhgc->mysrvs->add(mysrvc);
}
void MySQL_HostGroups_Manager::replication_lag_action(int _hid, char *address, unsigned int port, int current_replication_lag) {
GloAdmin->mysql_servers_wrlock();
wrlock();
void MySQL_HostGroups_Manager::replication_lag_action_inner(MyHGC *myhgc, char *address, unsigned int port, int current_replication_lag) {
int j;
MyHGC *myhgc = MyHGC_find(_hid);
if (myhgc) {
for (j=0; j<(int)myhgc->mysrvs->cnt(); j++) {
MySrvC *mysrvc=(MySrvC *)myhgc->mysrvs->servers->index(j);
if (strcmp(mysrvc->address,address)==0 && mysrvc->port==port) {
if (mysrvc->status==MYSQL_SERVER_STATUS_ONLINE) {
if (
// (current_replication_lag==-1 )
// ||
(current_replication_lag>=0 && ((unsigned int)current_replication_lag > mysrvc->max_replication_lag))
) {
// always increase the counter
mysrvc->cur_replication_lag_count += 1;
if (mysrvc->cur_replication_lag_count >= mysql_thread___monitor_replication_lag_count) {
proxy_warning("Shunning server %s:%d from HG %u with replication lag of %d second, count number: '%d'\n", address, port, myhgc->hid, current_replication_lag, mysrvc->cur_replication_lag_count);
mysrvc->status=MYSQL_SERVER_STATUS_SHUNNED_REPLICATION_LAG;
} else {
proxy_info(
"Not shunning server %s:%d from HG %u with replication lag of %d second, count number: '%d' < replication_lag_count: '%d'\n",
address,
port,
myhgc->hid,
current_replication_lag,
mysrvc->cur_replication_lag_count,
mysql_thread___monitor_replication_lag_count
);
}
for (j=0; j<(int)myhgc->mysrvs->cnt(); j++) {
MySrvC *mysrvc=(MySrvC *)myhgc->mysrvs->servers->index(j);
if (strcmp(mysrvc->address,address)==0 && mysrvc->port==port) {
if (mysrvc->status==MYSQL_SERVER_STATUS_ONLINE) {
if (
// (current_replication_lag==-1 )
// ||
(current_replication_lag>=0 && ((unsigned int)current_replication_lag > mysrvc->max_replication_lag))
) {
// always increase the counter
mysrvc->cur_replication_lag_count += 1;
if (mysrvc->cur_replication_lag_count >= mysql_thread___monitor_replication_lag_count) {
proxy_warning("Shunning server %s:%d from HG %u with replication lag of %d second, count number: '%d'\n", address, port, myhgc->hid, current_replication_lag, mysrvc->cur_replication_lag_count);
mysrvc->status=MYSQL_SERVER_STATUS_SHUNNED_REPLICATION_LAG;
} else {
mysrvc->cur_replication_lag_count = 0;
proxy_info(
"Not shunning server %s:%d from HG %u with replication lag of %d second, count number: '%d' < replication_lag_count: '%d'\n",
address,
port,
myhgc->hid,
current_replication_lag,
mysrvc->cur_replication_lag_count,
mysql_thread___monitor_replication_lag_count
);
}
} else {
if (mysrvc->status==MYSQL_SERVER_STATUS_SHUNNED_REPLICATION_LAG) {
if (
(current_replication_lag>=0 && ((unsigned int)current_replication_lag <= mysrvc->max_replication_lag))
||
(current_replication_lag==-2) // see issue 959
) {
mysrvc->status=MYSQL_SERVER_STATUS_ONLINE;
proxy_warning("Re-enabling server %s:%d from HG %u with replication lag of %d second\n", address, port, myhgc->hid, current_replication_lag);
mysrvc->cur_replication_lag_count = 0;
}
mysrvc->cur_replication_lag_count = 0;
}
} else {
if (mysrvc->status==MYSQL_SERVER_STATUS_SHUNNED_REPLICATION_LAG) {
if (
(current_replication_lag>=0 && ((unsigned int)current_replication_lag <= mysrvc->max_replication_lag))
||
(current_replication_lag==-2) // see issue 959
) {
mysrvc->status=MYSQL_SERVER_STATUS_ONLINE;
proxy_warning("Re-enabling server %s:%d from HG %u with replication lag of %d second\n", address, port, myhgc->hid, current_replication_lag);
mysrvc->cur_replication_lag_count = 0;
}
}
goto __exit_replication_lag_action;
}
return;
}
}
}
void MySQL_HostGroups_Manager::replication_lag_action(int _hid, char *address, unsigned int port, int current_replication_lag) {
GloAdmin->mysql_servers_wrlock();
wrlock();
if (mysql_thread___monitor_replication_lag_group_by_host == false) {
// legacy check. 1 check per server per hostgroup
MyHGC *myhgc = MyHGC_find(_hid);
replication_lag_action_inner(myhgc,address,port,current_replication_lag);
} else {
// only 1 check per server, no matter the hostgroup
// all hostgroups must be searched
for (unsigned int i=0; i<MyHostGroups->len; i++) {
MyHGC *myhgc=(MyHGC *)MyHostGroups->index(i);
replication_lag_action_inner(myhgc,address,port,current_replication_lag);
}
}
__exit_replication_lag_action:
wrunlock();
GloAdmin->mysql_servers_wrunlock();
}

@ -3233,7 +3233,12 @@ void * MySQL_Monitor::monitor_replication_lag() {
char *error=NULL;
SQLite3_result *resultset=NULL;
// add support for SSL
char *query=(char *)"SELECT hostgroup_id, hostname, port, max_replication_lag, use_ssl FROM mysql_servers WHERE max_replication_lag > 0 AND status NOT IN (2,3)";
char *query= NULL;
if (mysql_thread___monitor_replication_lag_group_by_host==true) {
query = (char *)"SELECT MIN(hostgroup_id), hostname, port, MIN(max_replication_lag), MAX(use_ssl) FROM mysql_servers WHERE max_replication_lag > 0 AND status NOT IN (2,3) GROUP BY hostname, port";
} else {
query=(char *)"SELECT hostgroup_id, hostname, port, max_replication_lag, use_ssl FROM mysql_servers WHERE max_replication_lag > 0 AND status NOT IN (2,3)";
}
t1=monotonic_time();
if (!GloMTH) return NULL; // quick exit during shutdown/restart

@ -452,6 +452,7 @@ static char * mysql_thread_variables_names[]= {
(char *)"monitor_read_only_interval",
(char *)"monitor_read_only_timeout",
(char *)"monitor_read_only_max_timeout_count",
(char *)"monitor_replication_lag_group_by_host",
(char *)"monitor_replication_lag_interval",
(char *)"monitor_replication_lag_timeout",
(char *)"monitor_replication_lag_count",
@ -990,6 +991,12 @@ th_metrics_map = std::make_tuple(
"Encodes different behaviors for nodes depending on their 'READ_ONLY' flag value.",
metric_tags {}
),
std::make_tuple (
p_th_gauge::mysql_monitor_replication_lag_group_by_host,
"proxysql_monitor_replication_lag_group_by_host",
"Encodes different replication lag check if the same server is in multiple hostgroups.",
metric_tags {}
),
std::make_tuple (
p_th_gauge::mysql_monitor_replication_lag_interval,
"proxysql_mysql_monitor_replication_lag_interval_seconds",
@ -1058,6 +1065,7 @@ MySQL_Threads_Handler::MySQL_Threads_Handler() {
variables.monitor_read_only_interval=1000;
variables.monitor_read_only_timeout=800;
variables.monitor_read_only_max_timeout_count=3;
variables.monitor_replication_lag_group_by_host=false;
variables.monitor_replication_lag_interval=10000;
variables.monitor_replication_lag_timeout=1000;
variables.monitor_replication_lag_count=1;
@ -2082,6 +2090,7 @@ char ** MySQL_Threads_Handler::get_variables_list() {
VariablesPointers_bool["log_mysql_warnings_enabled"] = make_tuple(&variables.log_mysql_warnings_enabled, false);
VariablesPointers_bool["log_unhealthy_connections"] = make_tuple(&variables.log_unhealthy_connections, false);
VariablesPointers_bool["monitor_enabled"] = make_tuple(&variables.monitor_enabled, false);
VariablesPointers_bool["monitor_replication_lag_group_by_host"] = make_tuple(&variables.monitor_replication_lag_group_by_host, false);
VariablesPointers_bool["monitor_wait_timeout"] = make_tuple(&variables.monitor_wait_timeout, false);
VariablesPointers_bool["monitor_writer_is_also_reader"] = make_tuple(&variables.monitor_writer_is_also_reader, false);
VariablesPointers_bool["multiplexing"] = make_tuple(&variables.multiplexing, false);
@ -3935,6 +3944,7 @@ void MySQL_Thread::refresh_variables() {
mysql_thread___monitor_read_only_interval=GloMTH->get_variable_int((char *)"monitor_read_only_interval");
mysql_thread___monitor_read_only_timeout=GloMTH->get_variable_int((char *)"monitor_read_only_timeout");
mysql_thread___monitor_read_only_max_timeout_count=GloMTH->get_variable_int((char *)"monitor_read_only_max_timeout_count");
mysql_thread___monitor_replication_lag_group_by_host=(bool)GloMTH->get_variable_int((char *)"monitor_replication_lag_group_by_host");
mysql_thread___monitor_replication_lag_interval=GloMTH->get_variable_int((char *)"monitor_replication_lag_interval");
mysql_thread___monitor_replication_lag_timeout=GloMTH->get_variable_int((char *)"monitor_replication_lag_timeout");
mysql_thread___monitor_replication_lag_count=GloMTH->get_variable_int((char *)"monitor_replication_lag_count");
@ -5188,6 +5198,7 @@ void MySQL_Threads_Handler::p_update_metrics() {
this->status_variables.p_gauge_array[p_th_gauge::mysql_monitor_read_only_interval]->Set(this->variables.monitor_read_only_interval/1000.0);
this->status_variables.p_gauge_array[p_th_gauge::mysql_monitor_read_only_timeout]->Set(this->variables.monitor_read_only_timeout/1000.0);
this->status_variables.p_gauge_array[p_th_gauge::mysql_monitor_writer_is_also_reader]->Set(this->variables.monitor_writer_is_also_reader);
this->status_variables.p_gauge_array[p_th_gauge::mysql_monitor_replication_lag_group_by_host]->Set(this->variables.monitor_replication_lag_group_by_host);
this->status_variables.p_gauge_array[p_th_gauge::mysql_monitor_replication_lag_interval]->Set(this->variables.monitor_replication_lag_interval/1000.0);
this->status_variables.p_gauge_array[p_th_gauge::mysql_monitor_replication_lag_timeout]->Set(this->variables.monitor_replication_lag_timeout/1000.0);
this->status_variables.p_gauge_array[p_th_gauge::mysql_monitor_history]->Set(this->variables.monitor_history/1000.0);

Loading…
Cancel
Save