From fc9ab855da4342a01457d1ff734c7dd44f343424 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Canna=C3=B2?= Date: Sun, 24 Apr 2016 02:28:53 +0000 Subject: [PATCH] First draft for bugs #543 When a server isn't responding to ping, it is flagged as shunned and all the connections need to be dropped. All the connections must be dropped before the server is brought back online --- include/MySQL_HostGroups_Manager.h | 3 ++ lib/MySQL_HostGroups_Manager.cpp | 65 +++++++++++++++++++++++++++--- lib/MySQL_Monitor.cpp | 51 ++++++++++++++++++++++- lib/MySQL_Session.cpp | 7 +++- lib/mysql_connection.cpp | 7 +++- 5 files changed, 123 insertions(+), 10 deletions(-) diff --git a/include/MySQL_HostGroups_Manager.h b/include/MySQL_HostGroups_Manager.h index 865514980..862a0fd33 100644 --- a/include/MySQL_HostGroups_Manager.h +++ b/include/MySQL_HostGroups_Manager.h @@ -55,12 +55,14 @@ class MySrvC { // MySQL Server Container unsigned long long bytes_sent; unsigned long long bytes_recv; bool shunned_automatic; + bool shunned_and_kill_all_connections; // if a serious failure is detected, this will cause all connections to die even if the server is just shunned //uint8_t charset; MySrvConnList *ConnectionsUsed; MySrvConnList *ConnectionsFree; MySrvC(char *, uint16_t, unsigned int, enum MySerStatus, unsigned int, unsigned int _max_connections, unsigned int _max_replication_lag); ~MySrvC(); void connect_error(int); + void shun_and_killall(); }; class MySrvList { // MySQL Server List @@ -153,6 +155,7 @@ class MySQL_HostGroups_Manager { void replication_lag_action(int, char*, unsigned int, int); void read_only_action(char *hostname, int port, int read_only); unsigned int get_servers_table_version(); + void shun_and_killall(char *hostname, int port); }; #endif /* __CLASS_MYSQL_HOSTGROUPS_MANAGER_H */ diff --git a/lib/MySQL_HostGroups_Manager.cpp b/lib/MySQL_HostGroups_Manager.cpp index 65d5eee7c..4d1649033 100644 --- a/lib/MySQL_HostGroups_Manager.cpp +++ 
b/lib/MySQL_HostGroups_Manager.cpp @@ -119,6 +119,7 @@ MySrvC::MySrvC(char *add, uint16_t p, unsigned int _weight, enum MySerStatus _st time_last_detected_error=0; connect_ERR_at_time_last_detected_error=0; shunned_automatic=false; + shunned_and_kill_all_connections=false; // false to default //charset=_charset; myhgc=NULL; ConnectionsUsed=new MySrvConnList(this); @@ -154,6 +155,12 @@ void MySrvC::connect_error(int err_num) { } } +void MySrvC::shun_and_killall() { + status=MYSQL_SERVER_STATUS_SHUNNED; + shunned_automatic=true; + shunned_and_kill_all_connections=true; +} + MySrvC::~MySrvC() { if (address) free(address); delete ConnectionsUsed; @@ -620,12 +627,19 @@ MySrvC *MyHGC::get_random_MySrvC() { max_wait_sec = 1; } if ((t - mysrvc->time_last_detected_error) > max_wait_sec) { - mysrvc->status=MYSQL_SERVER_STATUS_ONLINE; - mysrvc->shunned_automatic=false; - mysrvc->connect_ERR_at_time_last_detected_error=0; - mysrvc->time_last_detected_error=0; - // if a server is taken back online, consider it immediately - sum+=mysrvc->weight; + if ( + (mysrvc->shunned_and_kill_all_connections==false) // it is safe to bring it back online + || + (mysrvc->shunned_and_kill_all_connections==true && mysrvc->ConnectionsUsed->conns->len==0 && mysrvc->ConnectionsFree->conns->len==0) // if shunned_and_kill_all_connections is set, ensure all connections are already dropped + ) { + mysrvc->status=MYSQL_SERVER_STATUS_ONLINE; + mysrvc->shunned_automatic=false; + mysrvc->shunned_and_kill_all_connections=false; + mysrvc->connect_ERR_at_time_last_detected_error=0; + mysrvc->time_last_detected_error=0; + // if a server is taken back online, consider it immediately + sum+=mysrvc->weight; + } } } } @@ -1024,3 +1038,42 @@ void MySQL_HostGroups_Manager::read_only_action(char *hostname, int port, int re free(query); } + + + +// shun_and_killall +// this function is called only from MySQL_Monitor::monitor_ping() +// it temporary disables a host that is not responding to pings, and mark the host in 
a way that when used the connection will be dropped +void MySQL_HostGroups_Manager::shun_and_killall(char *hostname, int port) { + wrlock(); + MySrvC *mysrvc=NULL; + for (unsigned int i=0; ilen; i++) { + MyHGC *myhgc=(MyHGC *)MyHostGroups->index(i); + unsigned int j; + unsigned int sum=0; + unsigned int l=myhgc->mysrvs->cnt(); + if (l) { + for (j=0; jmysrvs->idx(j); + if (mysrvc->port==port && strcmp(mysrvc->address,hostname)==0) { + switch (mysrvc->status) { + case MYSQL_SERVER_STATUS_SHUNNED: + if (mysrvc->shunned_automatic==false) { + break; + } + case MYSQL_SERVER_STATUS_ONLINE: + case MYSQL_SERVER_STATUS_OFFLINE_SOFT: + mysrvc->status=MYSQL_SERVER_STATUS_SHUNNED; + mysrvc->shunned_automatic=true; + mysrvc->shunned_and_kill_all_connections=true; + mysrvc->ConnectionsFree->drop_all_connections(); + break; + default: + break; + } + } + } + } + } + wrunlock(); +} diff --git a/lib/MySQL_Monitor.cpp b/lib/MySQL_Monitor.cpp index dd25e1f84..d33e60dde 100644 --- a/lib/MySQL_Monitor.cpp +++ b/lib/MySQL_Monitor.cpp @@ -979,11 +979,60 @@ __end_monitor_ping_loop: free(sds); } - if (resultset) + if (resultset) { delete resultset; + resultset=NULL; + } event_base_free(libevent_base); + // now it is time to shun all problematic hosts + query=(char *)"SELECT DISTINCT a.hostname, a.port FROM mysql_servers a JOIN monitor.mysql_server_ping_log b ON a.hostname=b.hostname WHERE status!='OFFLINE_HARD' AND b.ping_error IS NOT NULL"; + proxy_debug(PROXY_DEBUG_ADMIN, 4, "%s\n", query); + admindb->execute_statement(query, &error , &cols , &affected_rows , &resultset); + if (error) { + proxy_error("Error on %s : %s\n", query, error); + } else { + // get all addresses and ports + int i=0; + int j=0; + char **addresses=(char **)malloc(resultset->rows_count * sizeof(char *)); + char **ports=(char **)malloc(resultset->rows_count * sizeof(char *)); + for (std::vector::iterator it = resultset->rows.begin() ; it != resultset->rows.end(); ++it) { + SQLite3_row *r=*it; + 
addresses[i]=strdup(r->fields[0]); + ports[i]=strdup(r->fields[1]); + i++; + } + if (resultset) { + delete resultset; + resultset=NULL; + } + char *new_query=(char *)"SELECT 1 FROM (SELECT hostname,port FROM monitor.mysql_server_ping_log WHERE hostname='%s' AND port='%s' ORDER BY time_start DESC LIMIT %d) a GROUP BY hostname,port HAVING COUNT(*)=%d"; + for (j=0;iexecute_statement(buff, &error , &cols , &affected_rows , &resultset); + free(buff); + if (!error) { + if (resultset) { + if (resultset->rows_count) { + // disable host + MyHGM->shun_and_killall(addresses[j],atoi(ports[j])); + } + delete resultset; + resultset=NULL; + } + } + } + while (i) { // now free all the addresses/ports + i--; + free(addresses[i]); + free(ports[i]); + } + free(addresses); + free(ports); + } __sleep_monitor_ping_loop: t2=monotonic_time(); diff --git a/lib/MySQL_Session.cpp b/lib/MySQL_Session.cpp index 0afc87bf4..1bf3ffef4 100644 --- a/lib/MySQL_Session.cpp +++ b/lib/MySQL_Session.cpp @@ -1043,8 +1043,11 @@ handler_again: } else { if (rc==-1) { // the query failed - if (myconn->parent->status==MYSQL_SERVER_STATUS_OFFLINE_HARD) { - // the query failed because the server is offline hard + if ( + (myconn->parent->status==MYSQL_SERVER_STATUS_OFFLINE_HARD) // the query failed because the server is offline hard + || + (myconn->parent->status==MYSQL_SERVER_STATUS_SHUNNED && myconn->parent->shunned_automatic==true && myconn->parent->shunned_and_kill_all_connections==true) // the query failed because the server is shunned due to a serious failure + ) { if (mysql_thread___connect_timeout_server_max) { myds->max_connect_time=thread->curtime+mysql_thread___connect_timeout_server_max*1000; } diff --git a/lib/mysql_connection.cpp b/lib/mysql_connection.cpp index 2f4233632..c9260ad5a 100644 --- a/lib/mysql_connection.cpp +++ b/lib/mysql_connection.cpp @@ -796,7 +796,12 @@ int MySQL_Connection::async_query(short event, char *stmt, unsigned long length) PROXY_TRACE(); assert(mysql); assert(ret_mysql); 
-	if (parent->status==MYSQL_SERVER_STATUS_OFFLINE_HARD)
+	if (
+		(parent->status==MYSQL_SERVER_STATUS_OFFLINE_HARD) // the server is OFFLINE as specified by the user
+		||
+		(parent->status==MYSQL_SERVER_STATUS_SHUNNED && parent->shunned_automatic==true && parent->shunned_and_kill_all_connections==true) // the server is SHUNNED due to a serious issue
+	) { // refuse the query only in the two cases above: the return below must stay inside this block
 		return -1;
+	}
 	switch (async_state_machine) {
 		case ASYNC_QUERY_END: