diff --git a/internal/daemon/controller/controller.go b/internal/daemon/controller/controller.go index 9d7578382b..8d08648777 100644 --- a/internal/daemon/controller/controller.go +++ b/internal/daemon/controller/controller.go @@ -144,6 +144,7 @@ type Controller struct { } func New(ctx context.Context, conf *Config) (*Controller, error) { + const op = "controller.New" metric.InitializeApiCollectors(conf.PrometheusRegisterer) c := &Controller{ conf: conf, @@ -395,7 +396,7 @@ func New(ctx context.Context, conf *Config) (*Controller, error) { } _, err = server.RotateRoots(ctx, serversRepo, nodeenrollment.WithCertificateLifetime(conf.TestOverrideWorkerAuthCaCertificateLifetime)) if err != nil { - return nil, fmt.Errorf("unable to ensure worker auth roots exist: %w", err) + event.WriteSysEvent(ctx, op, "unable to ensure worker auth roots exist, may be due to multiple controllers starting at once, continuing") } if downstreamersFactory != nil { diff --git a/internal/daemon/controller/testing.go b/internal/daemon/controller/testing.go index 13bc92d0ed..fcf9f118b8 100644 --- a/internal/daemon/controller/testing.go +++ b/internal/daemon/controller/testing.go @@ -473,6 +473,9 @@ type TestControllerOpts struct { // The amount of time between the scheduler waking up to run it's registered // jobs. SchedulerRunJobInterval time.Duration + + // The time to use for CA certificate lifetime for worker auth + WorkerAuthCaCertificateLifetime time.Duration } func NewTestController(t testing.TB, opts *TestControllerOpts) *TestController { @@ -774,6 +777,7 @@ func TestControllerConfig(t testing.TB, ctx context.Context, tc *TestController, RawConfig: opts.Config, Server: tc.b, DisableAuthorizationFailures: opts.DisableAuthorizationFailures, + TestOverrideWorkerAuthCaCertificateLifetime: opts.WorkerAuthCaCertificateLifetime, } } diff --git a/internal/daemon/worker/auth_rotation.go b/internal/daemon/worker/auth_rotation.go index 1e33fac5e1..7dcda19a7d 100644 --- a/internal/daemon/worker/auth_rotation.go +++ b/internal/daemon/worker/auth_rotation.go @@ -8,6 +8,8 @@ import ( "crypto/x509" "errors" "fmt" + "math/rand" + "sync/atomic" "time" berrors "github.com/hashicorp/boundary/internal/errors" @@ -17,6 +19,18 @@ import ( "github.com/hashicorp/nodeenrollment/types" ) +// The default time to use when we encounter an error or some other reason +// we can't get a better reset time +const defaultAuthRotationResetDuration = 5 * time.Second + +// AuthRotationResetDuration allows us to view it from tests, which allows us to +// get test timing right. It'll start at 0 which will cause us to run +// immediately. +var AuthRotationResetDuration time.Duration + +// AuthRotationNextRotation is useful in tests to understand how long to sleep +var AuthRotationNextRotation atomic.Pointer[time.Time] + func (w *Worker) startAuthRotationTicking(cancelCtx context.Context) { const op = "worker.(Worker).startAuthRotationTicking" @@ -25,22 +39,27 @@ func (w *Worker) startAuthRotationTicking(cancelCtx context.Context) { return } - // The default time to use when we encounter an error or some other reason - // we can't get a better reset time - const defaultResetDuration = time.Hour - var resetDuration time.Duration // will start at 0, so we run immediately - timer := time.NewTimer(defaultResetDuration) + // Get a valid value in + startNow := time.Now() + AuthRotationNextRotation.Store(&startNow) + + timer := time.NewTimer(defaultAuthRotationResetDuration) lastRotation := time.Time{}.UTC() for { - // You're not supposed to call reset on timers that haven't been stopped or - // expired, so we stop it here and reset it to the current resetDuration. That way if - // we want to adjust time, e.g. for tests, we can set the resetDuration - // value from within the loop + // Per the example in the docs, if you stop a timer you should drain the + // channel if it returns false. However, their example of blindly + // reading from the channel can deadlock. So we just do a select here to + // be safer. timer.Stop() + select { + case <-timer.C: + default: + } + if w.TestOverrideAuthRotationPeriod != 0 { - resetDuration = w.TestOverrideAuthRotationPeriod + AuthRotationResetDuration = w.TestOverrideAuthRotationPeriod } - timer.Reset(resetDuration) + timer.Reset(AuthRotationResetDuration) select { case <-cancelCtx.Done(): @@ -48,7 +67,10 @@ func (w *Worker) startAuthRotationTicking(cancelCtx context.Context) { return case <-timer.C: - resetDuration = defaultResetDuration + // Add some jitter in case there is some issue to prevent thundering + // herd + jitter := time.Duration(rand.Intn(6)) * time.Second + AuthRotationResetDuration = defaultAuthRotationResetDuration + jitter // Check if it's time to rotate and if not don't do anything currentNodeCreds, err := types.LoadNodeCredentials( @@ -59,18 +81,15 @@ func (w *Worker) startAuthRotationTicking(cancelCtx context.Context) { ) if err != nil { if errors.Is(err, nodeenrollment.ErrNotFound) { - // Be silent, but check again soon - resetDuration = 5 * time.Second + // Be silent as we likely haven't authorized yet continue } event.WriteError(cancelCtx, op, err) - resetDuration = 5 * time.Second continue } if currentNodeCreds == nil { event.WriteSysEvent(cancelCtx, op, "no error loading worker pki auth creds but nil creds, skipping rotation") - resetDuration = 5 * time.Second continue } @@ -81,7 +100,6 @@ func (w *Worker) startAuthRotationTicking(cancelCtx context.Context) { // again fairly quickly because this may also be prior to the // first rotation and we want to ensure the validity period is // checked soon. - resetDuration = 5 * time.Second continue } @@ -167,21 +185,22 @@ func (w *Worker) startAuthRotationTicking(cancelCtx context.Context) { )..., ) - // See if we don't need to do anything + // See if we don't need to do anything, and if so calculate reset + // duration and loop back if !lastRotation.IsZero() && now.Before(nextRotation) { - resetDuration = lastRotation.Add(rotationInterval / 2).Sub(now) + AuthRotationResetDuration = lastRotation.Add(rotationInterval / 2).Sub(now) continue } newRotationInterval, err := rotateWorkerAuth(cancelCtx, w, currentNodeCreds) if err != nil { - resetDuration = 5 * time.Second event.WriteError(cancelCtx, op, err) continue } lastRotation = now nextRotation = lastRotation.Add(newRotationInterval / 2) - resetDuration = nextRotation.Sub(now) + AuthRotationNextRotation.Store(&nextRotation) + AuthRotationResetDuration = nextRotation.Sub(now) event.WriteSysEvent(cancelCtx, op, "worker credentials rotated", "next_rotation", nextRotation) } } diff --git a/internal/daemon/worker/auth_rotation_test.go b/internal/daemon/worker/auth_rotation_test.go index 2b6233ecde..3d9e2097d8 100644 --- a/internal/daemon/worker/auth_rotation_test.go +++ b/internal/daemon/worker/auth_rotation_test.go @@ -30,27 +30,27 @@ func TestRotationTicking(t *testing.T) { Level: hclog.Trace, }) + const rotationPeriod = 20 * time.Second + conf, err := config.DevController() require.NoError(err) c := controller.NewTestController(t, &controller.TestControllerOpts{ - Config: conf, - Logger: logger.Named("controller"), + Config: conf, + Logger: logger.Named("controller"), + WorkerAuthCaCertificateLifetime: rotationPeriod, }) t.Cleanup(c.Shutdown) - const rotationPeriod = 20 * time.Second - // names should not be set when using pki workers wConf, err := config.DevWorker() require.NoError(err) wConf.Worker.Name = "" wConf.Worker.InitialUpstreams = c.ClusterAddrs() w := worker.NewTestWorker(t, &worker.TestWorkerOpts{ - InitialUpstreams: c.ClusterAddrs(), - Logger: logger.Named("worker"), - AuthRotationPeriod: rotationPeriod, - Config: wConf, + InitialUpstreams: c.ClusterAddrs(), + Logger: logger.Named("worker"), + Config: wConf, }) t.Cleanup(w.Shutdown) @@ -76,17 +76,21 @@ func TestRotationTicking(t *testing.T) { // Decode the proto into the request and create the worker req := new(types.FetchNodeCredentialsRequest) require.NoError(proto.Unmarshal(reqBytes, req)) - worker, err := serversRepo.CreateWorker(c.Context(), &server.Worker{ + newWorker, err := serversRepo.CreateWorker(c.Context(), &server.Worker{ Worker: &store.Worker{ ScopeId: scope.Global.String(), }, }, server.WithFetchNodeCredentialsRequest(req)) require.NoError(err) - // Verify we see one authorized set of credentials now + // Wait for a short while; there will be an initial rotation of credentials + // after authentication + time.Sleep(rotationPeriod / 2) + + // Verify we see authorized credentials now auths, err = workerAuthRepo.List(c.Context(), (*types.NodeInformation)(nil)) require.NoError(err) - assert.Len(auths, 1) + assert.Len(auths, 2) // Fetch creds and store current key currNodeCreds, err := types.LoadNodeCredentials( w.Context(), @@ -103,7 +107,9 @@ func TestRotationTicking(t *testing.T) { // Now we wait and check that we see new values in the DB and different // creds on the worker after each rotation period for i := 2; i < 5; i++ { - time.Sleep(rotationPeriod) + t.Log("iteration", i) + nextRotation := worker.AuthRotationNextRotation.Load() + time.Sleep((*nextRotation).Sub(time.Now()) + 5*time.Second) // Verify we see 2- after credentials have rotated, we should see current and previous auths, err = workerAuthRepo.List(c.Context(), (*types.NodeInformation)(nil)) @@ -130,7 +136,7 @@ func TestRotationTicking(t *testing.T) { assert.Equal(priorKeyId, previousKeyId) // Get workerAuthSet for this worker id and compare keys - workerAuthSet, err := workerAuthRepo.FindWorkerAuthByWorkerId(c.Context(), worker.PublicId) + workerAuthSet, err := workerAuthRepo.FindWorkerAuthByWorkerId(c.Context(), newWorker.PublicId) require.NoError(err) assert.NotNil(workerAuthSet) assert.NotNil(workerAuthSet.Previous)