Fix flaky TestServer_ReloadInitialUpstreams (#3758)

With the KMS changes this should never have worked at all (because the
different databases means the credentials conveyed after the KMS-based
authentication would never be valid). However, it was sometimes passing
because of a race condition: the worker could be found in the database
before the new credentials were returned to the worker, causing the
worker to start at the KMS step with the second controller.

This changes the test to use the same database for two controllers, but
to only ever have one running at a time, and uses a sequence of two
changed (or unchanged, depending) status updates from the worker to
ensure that the reload was successful.
pull/3753/head
Jeff Mitchell 3 years ago committed by GitHub
parent d158c1475b
commit b64eadedbf
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -11,8 +11,9 @@ import (
"testing"
"time"
"github.com/hashicorp/boundary/internal/daemon/controller"
"github.com/hashicorp/boundary/internal/server"
"github.com/hashicorp/boundary/testing/controller"
tc "github.com/hashicorp/boundary/testing/controller"
"github.com/mitchellh/cli"
"github.com/stretchr/testify/require"
"go.uber.org/atomic"
@ -51,25 +52,18 @@ func TestServer_ReloadInitialUpstreams(t *testing.T) {
recoveryWrapper, _ := wrapperWithKey(t)
workerAuthWrapper, key := wrapperWithKey(t)
// Create two controllers, each with their own database. In practice it
// would be odd to have separate databases, but it makes it easy for the
// test to assert that the worker has connected to the second controller
// after a reload signal is sent.
testController := controller.NewTestController(
testController := tc.NewTestController(
t,
controller.WithWorkerAuthKms(workerAuthWrapper),
controller.WithRootKms(rootWrapper),
controller.WithRecoveryKms(recoveryWrapper),
tc.WithWorkerAuthKms(workerAuthWrapper),
tc.WithRootKms(rootWrapper),
tc.WithRecoveryKms(recoveryWrapper),
tc.DisableDatabaseDestruction(),
)
defer testController.Shutdown()
testController2 := controller.NewTestController(
t,
controller.WithWorkerAuthKms(workerAuthWrapper),
controller.WithRootKms(rootWrapper),
controller.WithRecoveryKms(recoveryWrapper),
)
defer testController2.Shutdown()
require.NotEqual(testController.Config().DatabaseUrl, testController2.Config().DatabaseUrl)
t.Cleanup(testController.Shutdown)
testController2 := testController.AddClusterControllerMember(t, &controller.TestControllerOpts{
DisableAutoStart: true,
})
wg := &sync.WaitGroup{}
@ -91,25 +85,69 @@ func TestServer_ReloadInitialUpstreams(t *testing.T) {
t.Fatalf("timeout waiting for worker start")
}
// Wait until the worker has connected to the first controller
// Wait until the worker has connected to the first controller as seen via
// two status updates
timeout := time.NewTimer(15 * time.Second)
poll := time.NewTimer(0)
var w *server.Worker
var lastStatusTime time.Time
serversRepo, err := testController.Controller().ServersRepoFn()
require.NoError(err)
pollFirstController:
for {
select {
case <-timeout.C:
t.Fatalf("timeout wait for worker to connect to first controller")
case <-poll.C:
serversRepo, err := testController.Controller().ServersRepoFn()
require.NoError(err)
w, err = serversRepo.LookupWorkerByName(testController.Context(), "test")
require.NoError(err)
if w != nil {
timeout.Stop()
break pollFirstController
switch {
case lastStatusTime.IsZero():
lastStatusTime = w.GetLastStatusTime().AsTime().Round(time.Second)
default:
if !lastStatusTime.Equal(w.GetLastStatusTime().AsTime().Round(time.Second)) {
timeout.Stop()
break pollFirstController
}
}
}
poll.Reset(time.Second)
}
}
// Shut down first controller, start second, then ensure we are no longer
// seeing status updates.
testController.Shutdown()
require.NoError(testController2.Controller().Start())
t.Cleanup(testController2.Shutdown)
lastStatusTime = time.Time{}
timeout.Reset(15 * time.Second)
poll.Reset(0)
serversRepo, err = testController2.Controller().ServersRepoFn()
require.NoError(err)
pollForNoStatus:
for {
select {
case <-timeout.C:
// Great, didn't see it
poll.Stop()
break pollForNoStatus
case <-poll.C:
w, err = serversRepo.LookupWorkerByName(testController2.Context(), "test")
require.NoError(err)
if w != nil {
switch {
case lastStatusTime.IsZero():
lastStatusTime = w.GetLastStatusTime().AsTime().Round(time.Second)
default:
if !lastStatusTime.Equal(w.GetLastStatusTime().AsTime().Round(time.Second)) {
t.Fatal("found updated status times when not expected")
}
}
}
poll.Reset(1 * time.Millisecond)
poll.Reset(time.Second)
}
}
@ -124,21 +162,28 @@ pollFirstController:
// Wait until the worker connects to the second controller
timeout.Reset(15 * time.Second)
poll.Reset(10 * time.Millisecond)
poll.Reset(time.Second)
lastStatusTime = time.Time{}
pollSecondController:
for {
select {
case <-timeout.C:
t.Fatalf("timeout wait for worker to connect to second controller")
case <-poll.C:
serversRepo, err := testController2.Controller().ServersRepoFn()
require.NoError(err)
w, err = serversRepo.LookupWorkerByName(testController2.Context(), "test")
require.NoError(err)
if w != nil {
break pollSecondController
switch {
case lastStatusTime.IsZero():
lastStatusTime = w.GetLastStatusTime().AsTime().Round(time.Second)
default:
if !lastStatusTime.Round(time.Second).Equal(w.GetLastStatusTime().AsTime().Round(time.Second)) {
timeout.Stop()
break pollSecondController
}
}
}
poll.Reset(1 * time.Millisecond)
poll.Reset(time.Second)
}
}

@ -276,13 +276,13 @@ func (c *Controller) configureForCluster(ln *base.ServerListener) (func(), error
}
go func() {
err := splitListener.Start()
if err != nil {
if err != nil && !errors.Is(err, net.ErrClosed) {
event.WriteError(c.baseContext, op, err, event.WithInfoMsg("splitListener.Start() error"))
}
}()
go func() {
err := ln.GrpcServer.Serve(metric.InstrumentClusterTrackingListener(multiplexingAuthedListener, grpcListenerPurpose))
if err != nil {
if err != nil && !errors.Is(err, net.ErrClosed) {
event.WriteError(c.baseContext, op, err, event.WithInfoMsg("multiplexingAuthedListener error"))
}
}()

@ -810,6 +810,7 @@ func (tc *TestController) AddClusterControllerMember(t testing.TB, opts *TestCon
DefaultPassword: tc.b.DevPassword,
DisableKmsKeyCreation: true,
DisableAuthMethodCreation: true,
DisableAutoStart: opts.DisableAutoStart,
PublicClusterAddr: opts.PublicClusterAddr,
WorkerStatusGracePeriodDuration: opts.WorkerStatusGracePeriodDuration,
LivenessTimeToStaleDuration: opts.LivenessTimeToStaleDuration,

Loading…
Cancel
Save