From a2727739e4ec08c84ee1a79b32f6a2e9eb25cd24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Jaramago=20Fern=C3=A1ndez?= Date: Sat, 5 Sep 2020 02:50:02 +0200 Subject: [PATCH] Added new variable 'mysql-monitor_replication_lag_retries' This new variable controls the number of retries that replication lag needs to exceed 'max_replication_lag' in order to set a server SHUNNED. --- include/MySQL_HostGroups_Manager.h | 1 + include/MySQL_Thread.h | 1 + include/proxysql_structs.h | 2 ++ lib/MySQL_HostGroups_Manager.cpp | 21 +++++++++++++++++++-- lib/MySQL_Thread.cpp | 17 +++++++++++++++++ 5 files changed, 40 insertions(+), 2 deletions(-) diff --git a/include/MySQL_HostGroups_Manager.h b/include/MySQL_HostGroups_Manager.h index 7b4ca3672..18d93a1cf 100644 --- a/include/MySQL_HostGroups_Manager.h +++ b/include/MySQL_HostGroups_Manager.h @@ -140,6 +140,7 @@ class MySrvC { // MySQL Server Container unsigned int max_connections_used; // The maximum number of connections that has been opened unsigned int connect_OK; unsigned int connect_ERR; + unsigned int cur_replication_lag_retries; // note that these variables are in microsecond, while user defines max lantency in millisecond unsigned int current_latency_us; unsigned int max_latency_us; diff --git a/include/MySQL_Thread.h b/include/MySQL_Thread.h index 9a5ece971..47a7a33bb 100644 --- a/include/MySQL_Thread.h +++ b/include/MySQL_Thread.h @@ -377,6 +377,7 @@ class MySQL_Threads_Handler bool monitor_writer_is_also_reader; int monitor_replication_lag_interval; int monitor_replication_lag_timeout; + int monitor_replication_lag_retries; int monitor_groupreplication_healthcheck_interval; int monitor_groupreplication_healthcheck_timeout; int monitor_groupreplication_healthcheck_max_timeout_count; diff --git a/include/proxysql_structs.h b/include/proxysql_structs.h index c1ec54ae8..8da9aec1c 100644 --- a/include/proxysql_structs.h +++ b/include/proxysql_structs.h @@ -799,6 +799,7 @@ __thread bool 
mysql_thread___monitor_wait_timeout; __thread bool mysql_thread___monitor_writer_is_also_reader; __thread int mysql_thread___monitor_replication_lag_interval; __thread int mysql_thread___monitor_replication_lag_timeout; +__thread int mysql_thread___monitor_replication_lag_retries; __thread int mysql_thread___monitor_groupreplication_healthcheck_interval; __thread int mysql_thread___monitor_groupreplication_healthcheck_timeout; __thread int mysql_thread___monitor_groupreplication_healthcheck_max_timeout_count; @@ -946,6 +947,7 @@ extern __thread bool mysql_thread___monitor_wait_timeout; extern __thread bool mysql_thread___monitor_writer_is_also_reader; extern __thread int mysql_thread___monitor_replication_lag_interval; extern __thread int mysql_thread___monitor_replication_lag_timeout; +extern __thread int mysql_thread___monitor_replication_lag_retries; extern __thread int mysql_thread___monitor_groupreplication_healthcheck_interval; extern __thread int mysql_thread___monitor_groupreplication_healthcheck_timeout; extern __thread int mysql_thread___monitor_groupreplication_healthcheck_max_timeout_count; diff --git a/lib/MySQL_HostGroups_Manager.cpp b/lib/MySQL_HostGroups_Manager.cpp index 3982ecea1..f10a87822 100644 --- a/lib/MySQL_HostGroups_Manager.cpp +++ b/lib/MySQL_HostGroups_Manager.cpp @@ -840,6 +840,7 @@ MySrvC::MySrvC(char *add, uint16_t p, uint16_t gp, unsigned int _weight, enum My max_connections=_max_connections; max_replication_lag=_max_replication_lag; use_ssl=_use_ssl; + cur_replication_lag_retries=0; max_latency_us=_max_latency_ms*1000; current_latency_us=0; aws_aurora_current_lag_us = 0; @@ -3199,8 +3200,23 @@ void MySQL_HostGroups_Manager::replication_lag_action(int _hid, char *address, u // || (current_replication_lag>=0 && ((unsigned int)current_replication_lag > mysrvc->max_replication_lag)) ) { - proxy_warning("Shunning server %s:%d from HG %u with replication lag of %d second\n", address, port, myhgc->hid, current_replication_lag); - 
mysrvc->status=MYSQL_SERVER_STATUS_SHUNNED_REPLICATION_LAG; + if (mysrvc->cur_replication_lag_retries >= GloMTH->variables.monitor_replication_lag_retries) { + proxy_warning("Shunning server %s:%d from HG %u with replication lag of %d second, retry number: '%d'\n", address, port, myhgc->hid, current_replication_lag, mysrvc->cur_replication_lag_retries); + mysrvc->status=MYSQL_SERVER_STATUS_SHUNNED_REPLICATION_LAG; + } else { + proxy_info( + "Not shunning server %s:%d from HG %u with replication lag of %d second, retry number: '%d' < replication_lag_retries: '%d'\n", + address, + port, + myhgc->hid, + current_replication_lag, + mysrvc->cur_replication_lag_retries, + GloMTH->variables.monitor_replication_lag_retries + ); + mysrvc->cur_replication_lag_retries += 1; + } + } else { + mysrvc->cur_replication_lag_retries = 0; } } else { if (mysrvc->status==MYSQL_SERVER_STATUS_SHUNNED_REPLICATION_LAG) { @@ -3211,6 +3227,7 @@ void MySQL_HostGroups_Manager::replication_lag_action(int _hid, char *address, u ) { mysrvc->status=MYSQL_SERVER_STATUS_ONLINE; proxy_warning("Re-enabling server %s:%d from HG %u with replication lag of %d second\n", address, port, myhgc->hid, current_replication_lag); + mysrvc->cur_replication_lag_retries = 0; } } } diff --git a/lib/MySQL_Thread.cpp b/lib/MySQL_Thread.cpp index 30e5357ea..4d990ed6e 100644 --- a/lib/MySQL_Thread.cpp +++ b/lib/MySQL_Thread.cpp @@ -438,6 +438,7 @@ static char * mysql_thread_variables_names[]= { (char *)"monitor_read_only_max_timeout_count", (char *)"monitor_replication_lag_interval", (char *)"monitor_replication_lag_timeout", + (char *)"monitor_replication_lag_retries", (char *)"monitor_groupreplication_healthcheck_interval", (char *)"monitor_groupreplication_healthcheck_timeout", (char *)"monitor_groupreplication_healthcheck_max_timeout_count", @@ -1007,6 +1008,7 @@ MySQL_Threads_Handler::MySQL_Threads_Handler() { variables.monitor_read_only_max_timeout_count=3; variables.monitor_replication_lag_interval=10000; 
variables.monitor_replication_lag_timeout=1000; + variables.monitor_replication_lag_retries=0; variables.monitor_groupreplication_healthcheck_interval=5000; variables.monitor_groupreplication_healthcheck_timeout=800; variables.monitor_groupreplication_healthcheck_max_timeout_count=3; @@ -1339,6 +1341,7 @@ int MySQL_Threads_Handler::get_variable_int(const char *name) { if (!strcmp(name,"monitor_read_only_max_timeout_count")) return (int)variables.monitor_read_only_max_timeout_count; if (!strcmp(name,"monitor_replication_lag_interval")) return (int)variables.monitor_replication_lag_interval; if (!strcmp(name,"monitor_replication_lag_timeout")) return (int)variables.monitor_replication_lag_timeout; + if (!strcmp(name,"monitor_replication_lag_retries")) return (int)variables.monitor_replication_lag_retries; } if (a == 'g') { char b = name[9]; @@ -1697,6 +1700,10 @@ char * MySQL_Threads_Handler::get_variable(char *name) { // this is the public f sprintf(intbuf,"%d",variables.monitor_replication_lag_timeout); return strdup(intbuf); } + if (!strcasecmp(name,"monitor_replication_lag_retries")) { + sprintf(intbuf,"%d",variables.monitor_replication_lag_retries); + return strdup(intbuf); + } if (!strcasecmp(name,"monitor_groupreplication_healthcheck_interval")) { sprintf(intbuf,"%d",variables.monitor_groupreplication_healthcheck_interval); return strdup(intbuf); @@ -2254,6 +2261,15 @@ bool MySQL_Threads_Handler::set_variable(char *name, const char *value) { // thi return false; } } + if (!strcasecmp(name,"monitor_replication_lag_retries")) { + int intv=atoi(value); + if (intv >= 0 && intv <= std::numeric_limits<int>::max()) { + variables.monitor_replication_lag_retries=intv; + return true; + } else { + return false; + } + } if (!strcasecmp(name,"monitor_groupreplication_healthcheck_interval")) { int intv=atoi(value); if (intv >= 50 && intv <= 7*24*3600*1000) { @@ -4869,6 +4885,7 @@ void MySQL_Thread::refresh_variables() { 
mysql_thread___monitor_read_only_max_timeout_count=GloMTH->get_variable_int((char *)"monitor_read_only_max_timeout_count"); mysql_thread___monitor_replication_lag_interval=GloMTH->get_variable_int((char *)"monitor_replication_lag_interval"); mysql_thread___monitor_replication_lag_timeout=GloMTH->get_variable_int((char *)"monitor_replication_lag_timeout"); + mysql_thread___monitor_replication_lag_retries=GloMTH->get_variable_int((char *)"monitor_replication_lag_retries"); mysql_thread___monitor_groupreplication_healthcheck_interval=GloMTH->get_variable_int((char *)"monitor_groupreplication_healthcheck_interval"); mysql_thread___monitor_groupreplication_healthcheck_timeout=GloMTH->get_variable_int((char *)"monitor_groupreplication_healthcheck_timeout"); mysql_thread___monitor_groupreplication_healthcheck_max_timeout_count=GloMTH->get_variable_int((char *)"monitor_groupreplication_healthcheck_max_timeout_count");