From 3c91ac019b87d549a0c4ab1e1fa2acd416402e24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=C3=A9=20Canna=C3=B2?= Date: Sun, 10 Dec 2017 12:39:57 +0100 Subject: [PATCH] Introduced watchdog and automatic restart See https://github.com/sysown/proxysql/wiki/Watchdog --- include/MySQL_Thread.h | 2 + include/proxysql_glovars.hpp | 1 + lib/MySQL_Thread.cpp | 2 + src/main.cpp | 104 ++++++++++++++++++++++++++++++++++- src/proxysql.cfg | 2 +- 5 files changed, 107 insertions(+), 4 deletions(-) diff --git a/include/MySQL_Thread.h b/include/MySQL_Thread.h index fca9d3858..3e56761fb 100644 --- a/include/MySQL_Thread.h +++ b/include/MySQL_Thread.h @@ -6,6 +6,7 @@ #ifdef IDLE_THREADS #include #endif // IDLE_THREADS +#include <atomic> #define MIN_POLL_LEN 8 #define MIN_POLL_DELETE_RATIO 8 @@ -186,6 +187,7 @@ class MySQL_Thread unsigned long long curtime; unsigned long long pre_poll_time; unsigned long long last_maintenance_time; + std::atomic<unsigned long long> atomic_curtime; PtrArray *mysql_sessions; PtrArray *mirror_queue_mysql_sessions; PtrArray *mirror_queue_mysql_sessions_cache; diff --git a/include/proxysql_glovars.hpp b/include/proxysql_glovars.hpp index d5217aed4..552944303 100644 --- a/include/proxysql_glovars.hpp +++ b/include/proxysql_glovars.hpp @@ -54,6 +54,7 @@ class ProxySQL_GlobalVariables { char *statsdb_disk; char *errorlog; char *pid; + int restart_on_missing_heartbeats; struct { unsigned long long start_time; bool gdbg; diff --git a/lib/MySQL_Thread.cpp b/lib/MySQL_Thread.cpp index b646fad71..d7713fda5 100644 --- a/lib/MySQL_Thread.cpp +++ b/lib/MySQL_Thread.cpp @@ -2425,6 +2425,7 @@ void MySQL_Thread::run() { #endif // IDLE_THREADS curtime=monotonic_time(); + atomic_curtime=curtime; #ifdef PROXYSQL_MYSQL_PTHREAD_MUTEX pthread_mutex_lock(&thread_mutex); @@ -2734,6 +2735,7 @@ __run_skip_1a: mypolls.poll_timeout=0; // always reset this to 0 . 
If a session needs a specific timeout, it will set this one curtime=monotonic_time(); + atomic_curtime=curtime; poll_timeout_bool=false; if ( #ifdef IDLE_THREADS diff --git a/src/main.cpp b/src/main.cpp index 09e4e6f4f..0a9215b88 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -22,6 +22,13 @@ #endif + +// Note: if you are running ProxySQL under gdb, you may consider setting this +// variable immediately to 1 +// Example: +// set disable_watchdog=1 +volatile int disable_watchdog = 0; + void parent_open_error_log() { if (GloVars.global.foreground==false) { int outfd=0; @@ -276,9 +283,20 @@ void * mysql_shared_query_cache_funct(void *arg) { void ProxySQL_Main_process_global_variables(int argc, const char **argv) { GloVars.parse(argc,argv); GloVars.process_opts_pre(); + GloVars.restart_on_missing_heartbeats = 10; // default // alwasy try to open a config file if (GloVars.confFile->OpenFile(GloVars.config_file) == true) { GloVars.configfile_open=true; + const Setting& root = GloVars.confFile->cfg->getRoot(); + if (root.exists("restart_on_missing_heartbeats")==true) { + // reading restart_on_missing_heartbeats from config file + int restart_on_missing_heartbeats; + bool rc; + rc=root.lookupValue("restart_on_missing_heartbeats", restart_on_missing_heartbeats); + if (rc==true) { + GloVars.restart_on_missing_heartbeats=restart_on_missing_heartbeats; + } + } } else { proxy_warning("Unable to open config file %s\n", GloVars.config_file); // issue #705 } @@ -286,7 +304,7 @@ void ProxySQL_Main_process_global_variables(int argc, const char **argv) { if (GloVars.__cmd_proxysql_datadir==NULL) { // datadir was not specified , try to read config file if (GloVars.configfile_open==true) { - const Setting& root = GloVars.confFile->cfg->getRoot(); + const Setting& root = GloVars.confFile->cfg->getRoot(); if (root.exists("datadir")==true) { // reading datadir from config file std::string datadir; @@ -301,6 +319,20 @@ void ProxySQL_Main_process_global_variables(int argc, const char 
**argv) { // datadir was not specified in config file GloVars.datadir=strdup(t); } + if (root.exists("restart_on_missing_heartbeats")==true) { + // reading restart_on_missing_heartbeats from config file + int restart_on_missing_heartbeats; + bool rc; + rc=root.lookupValue("restart_on_missing_heartbeats", restart_on_missing_heartbeats); + if (rc==true) { + GloVars.restart_on_missing_heartbeats=restart_on_missing_heartbeats; + } else { + GloVars.restart_on_missing_heartbeats = 10; // default + } + } else { + // restart_on_missing_heartbeats was not specified in config file + GloVars.restart_on_missing_heartbeats = 10; // default + } } else { // config file not readable GloVars.datadir=strdup(t); @@ -945,8 +977,74 @@ __start_label: #endif } - while (glovars.shutdown==0) { - usleep(500000); // FIXME: TERRIBLE UGLY + { + unsigned int missed_heartbeats = 0; + unsigned long long previous_time = monotonic_time(); + unsigned int inner_loops = 0; + while (glovars.shutdown==0) { + usleep(200000); + if (disable_watchdog) { + continue; + } + unsigned long long curtime = monotonic_time(); + inner_loops++; + if (curtime >= inner_loops*300000 + previous_time ) { + // if this happens, it means that this very simple loop is blocked + // probably we are running under gdb + previous_time = curtime; + inner_loops = 0; + continue; + } + if (GloMTH) { + unsigned long long atomic_curtime = 0; + unsigned long long poll_timeout = (unsigned int)GloMTH->variables.poll_timeout; + unsigned int threads_missing_heartbeat = 0; + poll_timeout += 1000; // add 1 second (rounding up) + poll_timeout *= 1000; // convert to us + if (curtime < previous_time + poll_timeout) { + continue; + } + previous_time = curtime; + inner_loops = 0; + unsigned int i; + if (GloMTH->mysql_threads) { + for (i=0; i<GloMTH->num_threads; i++) { + if (GloMTH->mysql_threads[i].worker) { + atomic_curtime = GloMTH->mysql_threads[i].worker->atomic_curtime; + if (curtime > atomic_curtime + poll_timeout) { + threads_missing_heartbeat++; + } 
+ } + } + } +#ifdef IDLE_THREADS + if (GloVars.global.idle_threads) { + if (GloMTH->mysql_threads) { + for (i=0; i<GloMTH->num_threads; i++) { + if (GloMTH->mysql_threads_idles[i].worker) { + atomic_curtime = GloMTH->mysql_threads_idles[i].worker->atomic_curtime; + if (curtime > atomic_curtime + poll_timeout) { + threads_missing_heartbeat++; + } + } + } + } + } +#endif + if (threads_missing_heartbeat) { + proxy_error("Watchdog: %u threads missed a heartbeat\n", threads_missing_heartbeat); + missed_heartbeats++; + if (missed_heartbeats >= (unsigned int)GloVars.restart_on_missing_heartbeats) { + if (GloVars.restart_on_missing_heartbeats) { + proxy_error("Watchdog: reached %u missed heartbeats. Aborting!\n", missed_heartbeats); + assert(0); + } + } + } else { + missed_heartbeats = 0; + } + } + } } __shutdown: diff --git a/src/proxysql.cfg b/src/proxysql.cfg index 869986c20..2311ff1d7 100644 --- a/src/proxysql.cfg +++ b/src/proxysql.cfg @@ -5,7 +5,7 @@ # Grammar is also copied at the end of this file - +restart_on_missing_heartbeats=10 datadir="/var/lib/proxysql" admin_variables=