Simplify worker auth rotation timing logic (#3237)

* Simplify worker auth rotation timing logic * Add some jitter
3 years ago · e0787f4de0
parent 6f4f51cc9f
commit e0787f4de0
4 changed files with 65 additions and 35 deletions
--- a/internal/daemon/controller/controller.go
+++ b/internal/daemon/controller/controller.go
@ -144,6 +144,7 @@ type Controller struct {
 }

 func New(ctx context.Context, conf *Config) (*Controller, error) {
+	const op = "controller.New"
 	metric.InitializeApiCollectors(conf.PrometheusRegisterer)
 	c := &Controller{
 		conf:                    conf,
@ -395,7 +396,7 @@ func New(ctx context.Context, conf *Config) (*Controller, error) {
 	}
 	_, err = server.RotateRoots(ctx, serversRepo, nodeenrollment.WithCertificateLifetime(conf.TestOverrideWorkerAuthCaCertificateLifetime))
 	if err != nil {
-		return nil, fmt.Errorf("unable to ensure worker auth roots exist: %w", err)
+		event.WriteSysEvent(ctx, op, "unable to ensure worker auth roots exist, may be due to multiple controllers starting at once, continuing")
 	}

 	if downstreamersFactory != nil {
--- a/internal/daemon/controller/testing.go
+++ b/internal/daemon/controller/testing.go
@ -473,6 +473,9 @@ type TestControllerOpts struct {
 	// The amount of time between the scheduler waking up to run it's registered
 	// jobs.
 	SchedulerRunJobInterval time.Duration
+
+	// The time to use for CA certificate lifetime for worker auth
+	WorkerAuthCaCertificateLifetime time.Duration
 }

 func NewTestController(t testing.TB, opts *TestControllerOpts) *TestController {
@ -774,6 +777,7 @@ func TestControllerConfig(t testing.TB, ctx context.Context, tc *TestController,
 		RawConfig:                    opts.Config,
 		Server:                       tc.b,
 		DisableAuthorizationFailures: opts.DisableAuthorizationFailures,
+		TestOverrideWorkerAuthCaCertificateLifetime: opts.WorkerAuthCaCertificateLifetime,
 	}
 }

--- a/internal/daemon/worker/auth_rotation.go
+++ b/internal/daemon/worker/auth_rotation.go
@ -8,6 +8,8 @@ import (
 	"crypto/x509"
 	"errors"
 	"fmt"
+	"math/rand"
+	"sync/atomic"
 	"time"

 	berrors "github.com/hashicorp/boundary/internal/errors"
@ -17,6 +19,18 @@ import (
 	"github.com/hashicorp/nodeenrollment/types"
 )

+// The default time to use when we encounter an error or some other reason
+// we can't get a better reset time
+const defaultAuthRotationResetDuration = 5 * time.Second
+
+// AuthRotationResetDuration allows us to view it from tests, which allows us to
+// get test timing right. It'll start at 0 which will cause us to run
+// immediately.
+var AuthRotationResetDuration time.Duration
+
+// AuthRotationNextRotation is useful in tests to understand how long to sleep
+var AuthRotationNextRotation atomic.Pointer[time.Time]
+
 func (w *Worker) startAuthRotationTicking(cancelCtx context.Context) {
 	const op = "worker.(Worker).startAuthRotationTicking"

@ -25,22 +39,27 @@ func (w *Worker) startAuthRotationTicking(cancelCtx context.Context) {
 		return
 	}

-	// The default time to use when we encounter an error or some other reason
-	// we can't get a better reset time
-	const defaultResetDuration = time.Hour
-	var resetDuration time.Duration // will start at 0, so we run immediately
-	timer := time.NewTimer(defaultResetDuration)
+	// Get a valid value in
+	startNow := time.Now()
+	AuthRotationNextRotation.Store(&startNow)
+
+	timer := time.NewTimer(defaultAuthRotationResetDuration)
 	lastRotation := time.Time{}.UTC()
 	for {
-		// You're not supposed to call reset on timers that haven't been stopped or
-		// expired, so we stop it here and reset it to the current resetDuration. That way if
-		// we want to adjust time, e.g. for tests, we can set the resetDuration
-		// value from within the loop
+		// Per the example in the docs, if you stop a timer you should drain the
+		// channel if it returns false. However, their example of blindly
+		// reading from the channel can deadlock. So we just do a select here to
+		// be safer.
 		timer.Stop()
+		select {
+		case <-timer.C:
+		default:
+		}
+
 		if w.TestOverrideAuthRotationPeriod != 0 {
-			resetDuration = w.TestOverrideAuthRotationPeriod
+			AuthRotationResetDuration = w.TestOverrideAuthRotationPeriod
 		}
-		timer.Reset(resetDuration)
+		timer.Reset(AuthRotationResetDuration)

 		select {
 		case <-cancelCtx.Done():
@ -48,7 +67,10 @@ func (w *Worker) startAuthRotationTicking(cancelCtx context.Context) {
 			return

 		case <-timer.C:
-			resetDuration = defaultResetDuration
+			// Add some jitter in case there is some issue to prevent thundering
+			// herd
+			jitter := time.Duration(rand.Intn(6)) * time.Second
+			AuthRotationResetDuration = defaultAuthRotationResetDuration + jitter

 			// Check if it's time to rotate and if not don't do anything
 			currentNodeCreds, err := types.LoadNodeCredentials(
@ -59,18 +81,15 @@ func (w *Worker) startAuthRotationTicking(cancelCtx context.Context) {
 			)
 			if err != nil {
 				if errors.Is(err, nodeenrollment.ErrNotFound) {
-					// Be silent, but check again soon
-					resetDuration = 5 * time.Second
+					// Be silent as we likely haven't authorized yet
 					continue
 				}
 				event.WriteError(cancelCtx, op, err)
-				resetDuration = 5 * time.Second
 				continue
 			}

 			if currentNodeCreds == nil {
 				event.WriteSysEvent(cancelCtx, op, "no error loading worker pki auth creds but nil creds, skipping rotation")
-				resetDuration = 5 * time.Second
 				continue
 			}

@ -81,7 +100,6 @@ func (w *Worker) startAuthRotationTicking(cancelCtx context.Context) {
 				// again fairly quickly because this may also be prior to the
 				// first rotation and we want to ensure the validity period is
 				// checked soon.
-				resetDuration = 5 * time.Second
 				continue
 			}

@ -167,21 +185,22 @@ func (w *Worker) startAuthRotationTicking(cancelCtx context.Context) {
 			)...,
 			)

-			// See if we don't need to do anything
+			// See if we don't need to do anything, and if so calculate reset
+			// duration and loop back
 			if !lastRotation.IsZero() && now.Before(nextRotation) {
-				resetDuration = lastRotation.Add(rotationInterval / 2).Sub(now)
+				AuthRotationResetDuration = lastRotation.Add(rotationInterval / 2).Sub(now)
 				continue
 			}

 			newRotationInterval, err := rotateWorkerAuth(cancelCtx, w, currentNodeCreds)
 			if err != nil {
-				resetDuration = 5 * time.Second
 				event.WriteError(cancelCtx, op, err)
 				continue
 			}
 			lastRotation = now
 			nextRotation = lastRotation.Add(newRotationInterval / 2)
-			resetDuration = nextRotation.Sub(now)
+			AuthRotationNextRotation.Store(&nextRotation)
+			AuthRotationResetDuration = nextRotation.Sub(now)
 			event.WriteSysEvent(cancelCtx, op, "worker credentials rotated", "next_rotation", nextRotation)
 		}
 	}
--- a/internal/daemon/worker/auth_rotation_test.go
+++ b/internal/daemon/worker/auth_rotation_test.go
@ -30,27 +30,27 @@ func TestRotationTicking(t *testing.T) {
 		Level: hclog.Trace,
 	})

+	const rotationPeriod = 20 * time.Second
+
 	conf, err := config.DevController()
 	require.NoError(err)

 	c := controller.NewTestController(t, &controller.TestControllerOpts{
-		Config: conf,
-		Logger: logger.Named("controller"),
+		Config:                          conf,
+		Logger:                          logger.Named("controller"),
+		WorkerAuthCaCertificateLifetime: rotationPeriod,
 	})
 	t.Cleanup(c.Shutdown)

-	const rotationPeriod = 20 * time.Second
-
 	// names should not be set when using pki workers
 	wConf, err := config.DevWorker()
 	require.NoError(err)
 	wConf.Worker.Name = ""
 	wConf.Worker.InitialUpstreams = c.ClusterAddrs()
 	w := worker.NewTestWorker(t, &worker.TestWorkerOpts{
-		InitialUpstreams:   c.ClusterAddrs(),
-		Logger:             logger.Named("worker"),
-		AuthRotationPeriod: rotationPeriod,
-		Config:             wConf,
+		InitialUpstreams: c.ClusterAddrs(),
+		Logger:           logger.Named("worker"),
+		Config:           wConf,
 	})
 	t.Cleanup(w.Shutdown)

@ -76,17 +76,21 @@ func TestRotationTicking(t *testing.T) {
 	// Decode the proto into the request and create the worker
 	req := new(types.FetchNodeCredentialsRequest)
 	require.NoError(proto.Unmarshal(reqBytes, req))
-	worker, err := serversRepo.CreateWorker(c.Context(), &server.Worker{
+	newWorker, err := serversRepo.CreateWorker(c.Context(), &server.Worker{
 		Worker: &store.Worker{
 			ScopeId: scope.Global.String(),
 		},
 	}, server.WithFetchNodeCredentialsRequest(req))
 	require.NoError(err)

-	// Verify we see one authorized set of credentials now
+	// Wait for a short while; there will be an initial rotation of credentials
+	// after authentication
+	time.Sleep(rotationPeriod / 2)
+
+	// Verify we see authorized credentials now
 	auths, err = workerAuthRepo.List(c.Context(), (*types.NodeInformation)(nil))
 	require.NoError(err)
-	assert.Len(auths, 1)
+	assert.Len(auths, 2)
 	// Fetch creds and store current key
 	currNodeCreds, err := types.LoadNodeCredentials(
 		w.Context(),
@ -103,7 +107,9 @@ func TestRotationTicking(t *testing.T) {
 	// Now we wait and check that we see new values in the DB and different
 	// creds on the worker after each rotation period
 	for i := 2; i < 5; i++ {
-		time.Sleep(rotationPeriod)
+		t.Log("iteration", i)
+		nextRotation := worker.AuthRotationNextRotation.Load()
+		time.Sleep((*nextRotation).Sub(time.Now()) + 5*time.Second)

 		// Verify we see 2- after credentials have rotated, we should see current and previous
 		auths, err = workerAuthRepo.List(c.Context(), (*types.NodeInformation)(nil))
@ -130,7 +136,7 @@ func TestRotationTicking(t *testing.T) {
 		assert.Equal(priorKeyId, previousKeyId)

 		// Get workerAuthSet for this worker id and compare keys
-		workerAuthSet, err := workerAuthRepo.FindWorkerAuthByWorkerId(c.Context(), worker.PublicId)
+		workerAuthSet, err := workerAuthRepo.FindWorkerAuthByWorkerId(c.Context(), newWorker.PublicId)
 		require.NoError(err)
 		assert.NotNil(workerAuthSet)
 		assert.NotNil(workerAuthSet.Previous)