From a2c529b9d14898bdfbacaf00165c2ba42941d849 Mon Sep 17 00:00:00 2001 From: Timothy Messier Date: Fri, 17 May 2024 11:59:50 -0400 Subject: [PATCH] fix(controller): Use max request duration cluster listener (#4805) The cluster listener supports a `max_request_duration` option, and defaults to 90 seconds. However, this configuration was not being use when processing grpc requests from the listener. This meant that requests from worker to controller could hang indefinitely and contributed to the recently fixed issue with database connections being stuck in `idle in transaction`. This commit adds a grpc interceptor that ensures each request has a deadline set to the max request duration for the listener. Refs: 29cd7bc9d6128502f918b2469c71eb6bc3751d72 --- CHANGELOG.md | 18 ++++++++++++++++++ internal/daemon/controller/interceptor.go | 10 ++++++++++ internal/daemon/controller/listeners.go | 1 + 3 files changed, 29 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a6381ff7c0..1c6ea84bd8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,24 @@ Canonical reference for changes, improvements, and bugfixes for Boundary. ## Next +### Bug Fixes + +* Fix a dead lock issue where the controller could get stuck with all of its + available database connections being stuck in `idle in transaction`. + If a controller is configured to have a `max_open_connections`, and was under + sufficient load in the form of requests from workers interacting with + sessions, like in the form of authorizing new session connections, the + controller could get stuck after consuming all of the database connections, + leaving them in the `idle in transaction` state. This was due to a + combination of issues, including the lack of a request timeout for worker to + controller grpc requests, and the session repository attempting to use a + separate database connection to retrieve a kms.Wrapper after already starting + a database transaction. The fixes move these kms operations outside of the + transaction and set a max request duration for the grpc requests based on + the cluster's listener configuration. + ([PR](https://github.com/hashicorp/boundary/pull/4803) and + [PR](https://github.com/hashicorp/boundary/pull/4805)) + ## 0.16.0 (2024/04/30) ### New and Improved diff --git a/internal/daemon/controller/interceptor.go b/internal/daemon/controller/interceptor.go index 136aabc524..dc521ef163 100644 --- a/internal/daemon/controller/interceptor.go +++ b/internal/daemon/controller/interceptor.go @@ -9,6 +9,7 @@ import ( "net/http" "reflect" "runtime/debug" + "time" grpc_recovery "github.com/grpc-ecosystem/go-grpc-middleware/recovery" "github.com/hashicorp/boundary/globals" @@ -496,6 +497,15 @@ func eventsResponseInterceptor( } } +func requestMaxDurationInterceptor(_ context.Context, maxRequestDuration time.Duration) grpc.UnaryServerInterceptor { + const op = "controller.requestMaxDurationInterceptor" + return func(interceptorCtx context.Context, req any, _ *grpc.UnaryServerInfo, handler grpc.UnaryHandler) (any, error) { + withTimeout, cancel := context.WithTimeout(interceptorCtx, maxRequestDuration) + defer cancel() + return handler(withTimeout, req) + } +} + func workerRequestInfoInterceptor(ctx context.Context, eventer *event.Eventer) (grpc.UnaryServerInterceptor, error) { const op = "worker.requestInfoInterceptor" if eventer == nil { diff --git a/internal/daemon/controller/listeners.go b/internal/daemon/controller/listeners.go index f6411ac2e3..d6982ffa35 100644 --- a/internal/daemon/controller/listeners.go +++ b/internal/daemon/controller/listeners.go @@ -226,6 +226,7 @@ func (c *Controller) configureForCluster(ln *base.ServerListener) (func(), error grpc.UnaryInterceptor( grpc_middleware.ChainUnaryServer( workerReqInterceptor, + requestMaxDurationInterceptor(c.baseContext, ln.Config.MaxRequestDuration), eventsRequestInterceptor(c.baseContext), // before we get started, send the required events with the request eventsResponseInterceptor(c.baseContext), // as we finish, send the required events with the response ),