// proxysql/lib/MyHGC.cpp

#include "../deps/json/json.hpp"
using json = nlohmann::json;
#define PROXYJSON
#include "MySQL_HostGroups_Manager.h"
#ifdef TEST_AURORA
// Test-only counters updated by MyHGC::get_random_MySrvC():
// running total of servers scanned (the hostgroup size is added on every call).
static unsigned long long array_mysrvc_total = 0;
// running total of candidates that were left when each call returned.
static unsigned long long array_mysrvc_cands = 0;
#endif // TEST_AURORA
// Global MySQL threads handler, defined elsewhere; read here for
// variables.hostgroup_manager_verbose in DEBUG builds.
extern MySQL_Threads_Handler *GloMTH;
/**
 * @brief Pick one backend server of this hostgroup by weighted random choice.
 *
 * The selection runs in the following stages:
 *  1. Scan every server. An ONLINE server becomes a candidate when the
 *     hostgroup does not exceed attributes.max_num_online_servers, the server
 *     is below its max_connections, its latency is under the limit and it
 *     satisfies the GTID / replication-lag requirement. An automatically
 *     SHUNNED server whose recovery time has elapsed is switched back to ONLINE
 *     and evaluated immediately (latency and GTID / lag checks only).
 *  2. If max_lag_ms > 0 and at least
 *     aurora_max_lag_ms_only_read_from_replicas candidates exist, candidates
 *     reporting aws_aurora_current_lag_us == 0 are dropped, unless every
 *     candidate reports 0.
 *  3. If the total weight is 0, automatically SHUNNED servers are brought back
 *     using a reduced wait time and evaluated.
 *  4. Optionally (min_num_servers_lantency_awareness), candidates slower than
 *     the average latency are dropped.
 *  5. One candidate is drawn at random, proportionally to its weight.
 *
 * @param gtid_uuid   GTID UUID handed to MyHGM->gtid_exists(); only used when
 *                    gtid_trxid is non-zero.
 * @param gtid_trxid  GTID transaction id the server must have; 0 disables the
 *                    GTID filter.
 * @param max_lag_ms  Maximum accepted aws_aurora_current_lag_us, expressed in
 *                    milliseconds; a negative value disables the lag filter.
 * @param sess        Session used to read per-thread variables and to update
 *                    status counters. When NULL, the counters, the Aurora
 *                    writer removal and the latency-awareness filter are
 *                    skipped.
 * @return The selected server, or NULL when no server qualifies.
 */
MySrvC *MyHGC::get_random_MySrvC(char * gtid_uuid, uint64_t gtid_trxid, int max_lag_ms, MySQL_Session *sess) {
	MySrvC *mysrvc=NULL;
	unsigned int j;
	unsigned int sum=0;            // total weight of the current candidates
	unsigned int TotalUsedConn=0;  // used connections of the current candidates; only the disabled overload filter below reads it
	unsigned int l=mysrvs->cnt();
	// Timestamp of the last "no servers" log line. Being a function-local static
	// it is shared by every hostgroup, so the throttle is global.
	static time_t last_hg_log = 0;
#ifdef TEST_AURORA
	unsigned long long a1 = array_mysrvc_total/10000;
	array_mysrvc_total += l;
	unsigned long long a2 = array_mysrvc_total/10000;
	if (a2 > a1) {
		fprintf(stderr, "Total: %llu, Candidates: %llu\n", array_mysrvc_total-l, array_mysrvc_cands);
	}
#endif // TEST_AURORA
	// Candidates live on the stack for small hostgroups and on the heap otherwise.
	MySrvC *mysrvcCandidates_static[32];
	MySrvC **mysrvcCandidates = mysrvcCandidates_static;
	unsigned int num_candidates = 0;
	bool max_connections_reached = false;
	if (l>32) {
		mysrvcCandidates = (MySrvC **)malloc(sizeof(MySrvC *)*l);
		if (mysrvcCandidates == NULL) {
			proxy_error("Hostgroup %u: unable to allocate memory for %u candidate servers\n", hid, l);
			return NULL;
		}
	}

	// Common epilogue: release the heap array (if any), update the test
	// counters and hand back the result.
	auto finish = [&](MySrvC *result) -> MySrvC * {
		if (mysrvcCandidates != mysrvcCandidates_static) {
			free(mysrvcCandidates);
		}
#ifdef TEST_AURORA
		array_mysrvc_cands += num_candidates;
#endif // TEST_AURORA
		return result;
	};

	// Append a server to the candidate list and account for its weight.
	auto add_candidate = [&](MySrvC *srv) {
		sum+=srv->weight;
		TotalUsedConn+=srv->ConnectionsUsed->conns_length();
		mysrvcCandidates[num_candidates]=srv;
		num_candidates++;
	};

	// Apply the latency, GTID and replication-lag filters to a server and add
	// it to the candidates when it passes. `count_skipped` selects whether a
	// server rejected because of its lag bumps the per-thread counter.
	auto consider_server = [&](MySrvC *srv, bool count_skipped) {
		if ( !( srv->current_latency_us < ( srv->max_latency_us ? srv->max_latency_us : mysql_thread___default_max_latency_ms*1000 ) ) ) {
			return; // the host is too far
		}
		if (gtid_trxid) {
			if (MyHGM->gtid_exists(srv, gtid_uuid, gtid_trxid)) {
				add_candidate(srv);
			}
		} else if (max_lag_ms >= 0) {
			if (static_cast<unsigned int>(max_lag_ms) >= srv->aws_aurora_current_lag_us / 1000) {
				add_candidate(srv);
			} else if (count_skipped && sess) {
				sess->thread->status_variables.stvar[st_var_aws_aurora_replicas_skipped_during_query]++;
			}
		} else {
			add_candidate(srv);
		}
	};

	if (l) {
		for (j=0; j<l; j++) {
			mysrvc=mysrvs->idx(j);
			if (mysrvc->get_status() == MYSQL_SERVER_STATUS_ONLINE) { // consider this server only if ONLINE
				if (mysrvc->myhgc->num_online_servers.load(std::memory_order_relaxed) > mysrvc->myhgc->attributes.max_num_online_servers) {
					// number of online servers in HG is outside the configured range
					mysrvc->myhgc->log_num_online_server_count_error();
				} else if (mysrvc->ConnectionsUsed->conns_length() >= mysrvc->max_connections) {
					// the server reached max_connections
					max_connections_reached = true;
				} else {
					consider_server(mysrvc, true);
				}
				continue;
			}
			if (mysrvc->get_status() != MYSQL_SERVER_STATUS_SHUNNED) {
				continue;
			}
			// try to recover shunned servers
			if (!mysrvc->shunned_automatic || !mysql_thread___shun_recovery_time_sec) {
				continue;
			}
			time_t t;
			t=time(NULL);
			// we do all these changes without locking . We assume the server is not used from long
			// even if the server is still in used and any of the follow command fails it is not critical
			// because this is only an attempt to recover a server that is probably dead anyway
			// the next few lines of code try to solve issue #530
			int max_wait_sec = ( mysql_thread___shun_recovery_time_sec * 1000 >= mysql_thread___connect_timeout_server_max ? mysql_thread___connect_timeout_server_max/1000 - 1 : mysql_thread___shun_recovery_time_sec );
			if (max_wait_sec < 1) { // min wait time should be at least 1 second
				max_wait_sec = 1;
			}
			if ( !( t > mysrvc->time_last_detected_error && (t - mysrvc->time_last_detected_error) > max_wait_sec ) ) {
				continue; // not shunned for long enough
			}
			if (
				(mysrvc->shunned_and_kill_all_connections==false) // it is safe to bring it back online
				||
				(mysrvc->shunned_and_kill_all_connections==true && mysrvc->ConnectionsUsed->conns_length()==0 && mysrvc->ConnectionsFree->conns_length()==0) // if shunned_and_kill_all_connections is set, ensure all connections are already dropped
			) {
#ifdef DEBUG
				if (GloMTH->variables.hostgroup_manager_verbose >= 3) {
					proxy_info("Unshunning server %s:%d.\n", mysrvc->address, mysrvc->port);
				}
#endif
				mysrvc->set_status(MYSQL_SERVER_STATUS_ONLINE);
				mysrvc->shunned_automatic=false;
				mysrvc->shunned_and_kill_all_connections=false;
				mysrvc->connect_ERR_at_time_last_detected_error=0;
				mysrvc->time_last_detected_error=0;
				// note: the following function scans all the hostgroups.
				// This is ok for now because we only have a global mutex.
				// If one day we implement a mutex per hostgroup (unlikely,
				// but possible), this must be taken into consideration
				if (mysql_thread___unshun_algorithm == 1) {
					MyHGM->unshun_server_all_hostgroups(mysrvc->address, mysrvc->port, t, max_wait_sec, &mysrvc->myhgc->hid);
				}
				// if a server is taken back online, consider it immediately
				consider_server(mysrvc, false);
			}
		}

		if (max_lag_ms > 0 && sess) { // we are using AWS Aurora, as this logic is implemented only here
			unsigned int min_num_replicas = sess->thread->variables.aurora_max_lag_ms_only_read_from_replicas;
			if (min_num_replicas && num_candidates >= min_num_replicas) { // there are at least N replicas
				// we try to remove the writer, but only if at least one
				// candidate reports a non-zero lag (otherwise every candidate
				// would be removed)
				bool any_lagging = false;
				for (j=0; j<num_candidates; j++) {
					if (mysrvcCandidates[j]->aws_aurora_current_lag_us != 0) {
						any_lagging = true;
						break;
					}
				}
				if (any_lagging) {
					j = 0;
					while (j < num_candidates) {
						mysrvc = mysrvcCandidates[j];
						if (mysrvc->aws_aurora_current_lag_us==0) {
							sum-=mysrvc->weight;
							TotalUsedConn-=mysrvc->ConnectionsUsed->conns_length();
							// Fill the hole with the last candidate and do NOT
							// advance: the element moved into slot j has not
							// been examined yet.
							mysrvcCandidates[j]=mysrvcCandidates[num_candidates-1];
							num_candidates--;
						} else {
							j++;
						}
					}
				}
			}
		}

		if (sum==0) {
			// per issue #531 , we try a desperate attempt to bring back online any shunned server
			// we do this lowering the maximum wait time to 10%
			time_t t;
			t=time(NULL);
			int max_wait_sec = ( mysql_thread___shun_recovery_time_sec * 1000 >= mysql_thread___connect_timeout_server_max ? mysql_thread___connect_timeout_server_max/10000 - 1 : mysql_thread___shun_recovery_time_sec/10 );
			if (max_wait_sec < 1) { // min wait time should be at least 1 second
				max_wait_sec = 1;
			}
			if (t - last_hg_log > 1) { // log this at most once per second to avoid spamming the logs
				last_hg_log = time(NULL);
				if (gtid_trxid) {
					// gtid_trxid is unsigned 64 bit: print it with a matching conversion
					proxy_error("Hostgroup %u has no servers ready for GTID '%s:%llu'. Waiting for replication...\n", hid, gtid_uuid, static_cast<unsigned long long>(gtid_trxid));
				} else {
					proxy_error("Hostgroup %u has no servers available%s! Checking servers shunned for more than %u second%s\n", hid,
						(max_connections_reached ? " or max_connections reached for all servers" : ""), max_wait_sec, max_wait_sec == 1 ? "" : "s");
				}
			}
			for (j=0; j<l; j++) {
				mysrvc=mysrvs->idx(j);
				if (mysrvc->get_status() == MYSQL_SERVER_STATUS_SHUNNED && mysrvc->shunned_automatic == true) {
					if ((t - mysrvc->time_last_detected_error) > max_wait_sec) {
						mysrvc->set_status(MYSQL_SERVER_STATUS_ONLINE);
						mysrvc->shunned_automatic=false;
						mysrvc->connect_ERR_at_time_last_detected_error=0;
						mysrvc->time_last_detected_error=0;
						// if a server is taken back online, consider it immediately
						consider_server(mysrvc, false);
					}
				}
			}
		}

		if (sum==0) {
			proxy_debug(PROXY_DEBUG_MYSQL_CONNPOOL, 7, "Returning MySrvC NULL because no backend ONLINE or with weight\n");
			return finish(NULL); // if we reach here, we couldn't find any target
		}

/*
		unsigned int New_sum=0;
		unsigned int New_TotalUsedConn=0;
		// we will now scan again to ignore overloaded servers
		for (j=0; j<num_candidates; j++) {
			mysrvc = mysrvcCandidates[j];
			unsigned int len=mysrvc->ConnectionsUsed->conns_length();
			if ((len * sum) <= (TotalUsedConn * mysrvc->weight * 1.5 + 1)) {
				New_sum+=mysrvc->weight;
				New_TotalUsedConn+=len;
			} else {
				// remove the candidate
				if (j+1 < num_candidates) {
					mysrvcCandidates[j] = mysrvcCandidates[num_candidates-1];
				}
				j--;
				num_candidates--;
			}
		}
*/

		unsigned int New_sum=sum;

		// latency awareness algorithm is enabled only when compiled with USE_MYSRVC_ARRAY
		if (sess && sess->thread->variables.min_num_servers_lantency_awareness) {
			if ((int) num_candidates >= sess->thread->variables.min_num_servers_lantency_awareness) {
				unsigned int servers_with_latency = 0;
				unsigned int total_latency_us = 0;
				// scan and verify that all servers have some latency
				for (j=0; j<num_candidates; j++) {
					mysrvc = mysrvcCandidates[j];
					if (mysrvc->current_latency_us) {
						servers_with_latency++;
						total_latency_us += mysrvc->current_latency_us;
					}
				}
				if (servers_with_latency == num_candidates) {
					// all servers have some latency.
					// That is good. If any server have no latency, something is wrong
					// and we will skip this algorithm
					sess->thread->status_variables.stvar[st_var_ConnPool_get_conn_latency_awareness]++;
					unsigned int avg_latency_us = total_latency_us/num_candidates;
					j = 0;
					while (j < num_candidates) {
						if (mysrvcCandidates[j]->current_latency_us > avg_latency_us) {
							// remove the candidate: overwrite it with the last one
							// and re-examine the same slot
							mysrvcCandidates[j] = mysrvcCandidates[num_candidates-1];
							num_candidates--;
						} else {
							j++;
						}
					}
					// we scan again to adjust weight
					New_sum = 0;
					for (j=0; j<num_candidates; j++) {
						New_sum+=mysrvcCandidates[j]->weight;
					}
				}
			}
		}

		if (New_sum==0) {
			// Only zero-weight candidates survived the latency filter: bail out
			// instead of computing a modulo by zero below.
			proxy_debug(PROXY_DEBUG_MYSQL_CONNPOOL, 7, "Returning MySrvC NULL because no backend ONLINE or with weight\n");
			return finish(NULL); // if we reach here, we couldn't find any target
		}

		// Draw k in [1, New_sum] and walk the cumulative weights: the first
		// candidate whose cumulative weight reaches k is the one selected.
		// NOTE(review): rand() is used for large sums presumably because
		// fastrand() has a limited output range -- confirm.
		unsigned int k;
		if (New_sum > 32768) {
			k=rand()%New_sum;
		} else {
			k=fastrand()%New_sum;
		}
		k++;
		New_sum=0;
		for (j=0; j<num_candidates; j++) {
			mysrvc = mysrvcCandidates[j];
			New_sum+=mysrvc->weight;
			if (k<=New_sum) {
				proxy_debug(PROXY_DEBUG_MYSQL_CONNPOOL, 7, "Returning MySrvC %p, server %s:%d\n", mysrvc, mysrvc->address, mysrvc->port);
				return finish(mysrvc);
			}
		}
	} else {
		// the hostgroup has no servers at all
		time_t t = time(NULL);
		if (t - last_hg_log > 1) {
			last_hg_log = time(NULL);
			proxy_error("Hostgroup %u has no servers available!\n", hid);
		}
	}
	proxy_debug(PROXY_DEBUG_MYSQL_CONNPOOL, 7, "Returning MySrvC NULL\n");
	return finish(NULL); // if we reach here, we couldn't find any target
}