From 42e682e120f918ebfb0b93522558f0a419562070 Mon Sep 17 00:00:00 2001 From: "Sorawis Nilparuk (Bo)" Date: Thu, 15 Jan 2026 12:29:06 -0800 Subject: [PATCH] Mark expired Vault token as expired when Vault is unreachable (#6235) * add shutdown function to TestVaultServer * implement shutdown function in TestVaultServer * stop renewing vault token when token has already expired * Add comment * clean up error checks * modify credential_vault_token_renewal_revocation_view to include expiration_time * include TokenExpirationTime in clientStore * add test to validate that unexpired tokens are not set as expired * add token status validation * add expiration time validation to tests * add small comment * remove unnecessary renew from test * fix token translation uses wrong time field and add tests * add more token status validation * fix lint * move migration to 103 * remove t.Parallel() comments * remove unused setup code * use atomic for testvault.stopped implement atomic read/write for testvault * remove file created by merge conflict * make gen --- internal/credential/vault/jobs.go | 49 ++++--- internal/credential/vault/jobs_test.go | 135 +++++++++++++++--- internal/credential/vault/private_store.go | 41 +++--- internal/credential/vault/supported.go | 12 +- internal/credential/vault/testing.go | 4 + ...vault_token_renewal_revocation_view.up.sql | 52 +++++++ 6 files changed, 226 insertions(+), 67 deletions(-) create mode 100644 internal/db/schema/migrations/oss/postgres/103/01_credential_vault_token_renewal_revocation_view.up.sql diff --git a/internal/credential/vault/jobs.go b/internal/credential/vault/jobs.go index 0013d09d8f..1b39365ca5 100644 --- a/internal/credential/vault/jobs.go +++ b/internal/credential/vault/jobs.go @@ -169,6 +169,12 @@ func (r *TokenRenewalJob) Run(ctx context.Context, _ time.Duration) error { return nil } +func isForbiddenError(err error) bool { + var respErr *vault.ResponseError + ok := errors.As(err, &respErr) + return ok && respErr.StatusCode == http.StatusForbidden +} + func (r *TokenRenewalJob) renewToken(ctx context.Context, s *clientStore) error { const op = "vault.(TokenRenewalJob).renewToken" databaseWrapper, err := r.kms.GetWrapper(ctx, s.ProjectId, kms.KeyPurposeDatabase) @@ -190,33 +196,34 @@ func (r *TokenRenewalJob) renewToken(ctx context.Context, s *clientStore) error return errors.Wrap(ctx, err, op) } - var respErr *vault.ResponseError renewedToken, err := vc.renewToken(ctx) - if ok := errors.As(err, &respErr); ok && respErr.StatusCode == http.StatusForbidden { + if err != nil { // Vault returned a 403 when attempting a renew self, the token is either expired // or malformed. Set status to "expired" so credentials created with token can be // cleaned up. - query, values := token.updateStatusQuery(ExpiredToken) - numRows, err := r.writer.Exec(ctx, query, values) - if err != nil { - return errors.Wrap(ctx, err, op) - } - if numRows != 1 { - return errors.New(ctx, errors.Unknown, op, "token expired but failed to update repo") - } - if s.TokenStatus == string(CurrentToken) { - event.WriteSysEvent(ctx, op, "Vault credential store current token has expired", "credential store id", s.PublicId) - } + // Also, check if the token has already expired based on time to avoid attempting + // to renew the expired token against an Vault server that may no longer exist. + if isForbiddenError(err) || time.Now().After(token.ExpirationTime.AsTime()) { + query, values := token.updateStatusQuery(ExpiredToken) + numRows, err := r.writer.Exec(ctx, query, values) + if err != nil { + return errors.Wrap(ctx, err, op) + } + if numRows != 1 { + return errors.New(ctx, errors.Unknown, op, "token expired but failed to update repo") + } + if s.TokenStatus == string(CurrentToken) { + event.WriteSysEvent(ctx, op, "Vault credential store current token has expired", "credential store id", s.PublicId) + } - // Set credentials associated with this token to expired as Vault will already cascade delete them - _, err = r.writer.Exec(ctx, updateCredentialStatusByTokenQuery, []any{ExpiredCredential, token.TokenHmac}) - if err != nil { - return errors.Wrap(ctx, err, op, errors.WithMsg("error updating credentials to revoked after revoking token")) + // Set credentials associated with this token to expired as Vault will already cascade delete them + _, err = r.writer.Exec(ctx, updateCredentialStatusByTokenQuery, []any{ExpiredCredential, token.TokenHmac}) + if err != nil { + return errors.Wrap(ctx, err, op, errors.WithMsg("error updating credentials to revoked after revoking token")) + } + // exit early as we mark the token as expired + return nil } - - return nil - } - if err != nil { return errors.Wrap(ctx, err, op, errors.WithMsg("unable to renew vault token")) } diff --git a/internal/credential/vault/jobs_test.go b/internal/credential/vault/jobs_test.go index bfa9cf6d19..14704278a4 100644 --- a/internal/credential/vault/jobs_test.go +++ b/internal/credential/vault/jobs_test.go @@ -155,7 +155,6 @@ func testVaultCred(t *testing.T, } func TestNewTokenRenewalJob(t *testing.T) { - // t.Parallel() - this was causing test failures, investigate before un-commenting conn, _ := db.TestSetup(t, "postgres") rw := db.New(conn) wrapper := db.TestWrapper(t) @@ -239,7 +238,6 @@ func TestNewTokenRenewalJob(t *testing.T) { } func TestTokenRenewalJob_RunLimits(t *testing.T) { - // t.Parallel() - this was causing test failures, investigate before un-commenting ctx := context.Background() conn, _ := db.TestSetup(t, "postgres") rw := db.New(conn) @@ -325,7 +323,6 @@ func TestTokenRenewalJob_RunLimits(t *testing.T) { } func TestTokenRenewalJob_Run(t *testing.T) { - // t.Parallel() - this was causing test failures, investigate before un-commenting ctx := context.Background() assert, require := assert.New(t), require.New(t) @@ -435,7 +432,6 @@ func TestTokenRenewalJob_Run(t *testing.T) { } func TestTokenRenewalJob_RunExpired(t *testing.T) { - // t.Parallel() - this was causing test failures, investigate before un-commenting ctx := context.Background() assert, require := assert.New(t), require.New(t) @@ -500,9 +496,119 @@ func TestTokenRenewalJob_RunExpired(t *testing.T) { require.Nil(cs) } -func TestTokenRenewalJob_NextRunIn(t *testing.T) { - // t.Parallel() - this was causing test failures, investigate before un-commenting +// TestTokenRenewalJob_Run_VaultUnreachableTemporarily tests that tokens are not marked +// as expired when Vault is unreachable temporarily. +func TestTokenRenewalJob_Run_VaultUnreachableTemporarily(t *testing.T) { + ctx := context.Background() + assert, require := assert.New(t), require.New(t) + + conn, _ := db.TestSetup(t, "postgres") + rw := db.New(conn) + wrapper := db.TestWrapper(t) + kmsCache := kms.TestKms(t, conn, wrapper) + sche := scheduler.TestScheduler(t, conn, wrapper, scheduler.WithRunJobsInterval(time.Second)) + _, prj := iam.TestScopes(t, iam.TestRepo(t, conn, wrapper)) + v := NewTestVaultServer(t) + + _, ct := v.CreateToken(t, WithTokenPeriod(time.Second*300)) + in, err := NewCredentialStore(prj.GetPublicId(), v.Addr, []byte(ct)) + assert.NoError(err) + require.NotNil(in) + + r, err := newTokenRenewalJob(ctx, rw, rw, kmsCache) + require.NoError(err) + + err = sche.RegisterJob(ctx, r) + require.NoError(err) + + repo, err := NewRepository(ctx, rw, rw, kmsCache, sche) + require.NoError(err) + cs, err := repo.CreateCredentialStore(ctx, in) + require.NoError(err) + tokenBeforeRenew := allocToken() + require.NoError(rw.LookupWhere(ctx, &tokenBeforeRenew, "store_id = ?", []any{cs.GetPublicId()})) + assert.True(time.Now().Before(tokenBeforeRenew.ExpirationTime.AsTime())) + assert.Equal(string(CurrentToken), tokenBeforeRenew.Status) + // Shutdown Vault server to make vault unreachable + v.Shutdown(t) + + // Renewal will fail because Vault is unreachable but job does not return an error + err = r.Run(ctx, 0) + require.NoError(err) + + // Verify token was not expired in repo + tokenAfterFailedRenew := allocToken() + require.NoError(rw.LookupWhere(ctx, &tokenAfterFailedRenew, "store_id = ?", []any{cs.GetPublicId()})) + // expiration time is still in the future and token should still be 'current' + assert.True(time.Now().Before(tokenAfterFailedRenew.ExpirationTime.AsTime())) + // expiration time should remain the same since renewal failed + assert.Equal(tokenBeforeRenew.ExpirationTime, tokenAfterFailedRenew.ExpirationTime) + assert.Equal(string(CurrentToken), tokenAfterFailedRenew.Status) +} + +// TestTokenRenewalJob_RunExpired_VaultUnreachable tests token renewal logic when the Vault server becomes unreachable. +// The job should stop attempting to renew the token once the token is expired since the renewal will never +// be successful and there is no guarantee that Vault server will ever be reachable again. +func TestTokenRenewalJob_RunExpired_VaultUnreachablePermanently(t *testing.T) { + ctx := context.Background() + assert, require := assert.New(t), require.New(t) + + conn, _ := db.TestSetup(t, "postgres") + rw := db.New(conn) + wrapper := db.TestWrapper(t) + kmsCache := kms.TestKms(t, conn, wrapper) + sche := scheduler.TestScheduler(t, conn, wrapper, scheduler.WithRunJobsInterval(time.Second)) + _, prj := iam.TestScopes(t, iam.TestRepo(t, conn, wrapper)) + v := NewTestVaultServer(t) + // Create 2s token so it expires in vault before we can renew it + _, ct := v.CreateToken(t, WithTokenPeriod(time.Second*2)) + + in, err := NewCredentialStore(prj.GetPublicId(), v.Addr, []byte(ct)) + assert.NoError(err) + require.NotNil(in) + + r, err := newTokenRenewalJob(ctx, rw, rw, kmsCache) + require.NoError(err) + + err = sche.RegisterJob(ctx, r) + require.NoError(err) + + repo, err := NewRepository(ctx, rw, rw, kmsCache, sche) + require.NoError(err) + cs, err := repo.CreateCredentialStore(ctx, in) + require.NoError(err) + + tokenBeforeRenew := allocToken() + require.NoError(rw.LookupWhere(ctx, &tokenBeforeRenew, "store_id = ?", []any{cs.GetPublicId()})) + // expiration time is in the future + assert.True(tokenBeforeRenew.ExpirationTime.AsTime().After(time.Now())) + assert.Equal(string(CurrentToken), tokenBeforeRenew.Status) + + err = r.Run(ctx, 0) + require.NoError(err) + tokenAfterSuccessfulRenew := allocToken() + require.NoError(rw.LookupWhere(ctx, &tokenAfterSuccessfulRenew, "store_id = ?", []any{cs.GetPublicId()})) + // successful renewal should have updated expiration time + assert.True(tokenAfterSuccessfulRenew.ExpirationTime.AsTime().After(tokenBeforeRenew.ExpirationTime.AsTime())) + assert.Equal(string(CurrentToken), tokenAfterSuccessfulRenew.Status) + + // Shutdown Vault server to make vault unreachable + v.Shutdown(t) + // Sleep to move clock and expire token + time.Sleep(time.Second * 2) + + // Renewal should fail, job does not return an error when renewal fails (emits error event) + err = r.Run(ctx, 0) + require.NoError(err) + + // token should be marked as expired in the repo since the expiration time has passed + tokenAfterFailedRenew := allocToken() + require.NoError(rw.LookupWhere(ctx, &tokenAfterFailedRenew, "store_id = ?", []any{cs.GetPublicId()})) + assert.Equal(string(ExpiredToken), tokenAfterFailedRenew.Status) +} + +func TestTokenRenewalJob_NextRunIn(t *testing.T) { ctx := context.Background() conn, _ := db.TestSetup(t, "postgres") @@ -629,7 +735,6 @@ func TestTokenRenewalJob_NextRunIn(t *testing.T) { } func TestNewTokenRevocationJob(t *testing.T) { - // t.Parallel() - this was causing test failures, investigate before un-commenting conn, _ := db.TestSetup(t, "postgres") rw := db.New(conn) wrapper := db.TestWrapper(t) @@ -713,8 +818,6 @@ func TestNewTokenRevocationJob(t *testing.T) { } func TestTokenRevocationJob_RunLimits(t *testing.T) { - // t.Parallel() - this was causing test failures, investigate before un-commenting - ctx := context.Background() conn, _ := db.TestSetup(t, "postgres") @@ -803,7 +906,6 @@ func TestTokenRevocationJob_RunLimits(t *testing.T) { } func TestTokenRevocationJob_Run(t *testing.T) { - // t.Parallel() - this was causing test failures, investigate before un-commenting ctx := context.Background() assert, require := assert.New(t), require.New(t) @@ -942,7 +1044,6 @@ func TestTokenRevocationJob_Run(t *testing.T) { } func TestNewCredentialRenewalJob(t *testing.T) { - // t.Parallel() - this was causing test failures, investigate before un-commenting conn, _ := db.TestSetup(t, "postgres") rw := db.New(conn) wrapper := db.TestWrapper(t) @@ -1026,7 +1127,6 @@ func TestNewCredentialRenewalJob(t *testing.T) { } func TestCredentialRenewalJob_RunLimits(t *testing.T) { - // t.Parallel() - this was causing test failures, investigate before un-commenting ctx := context.Background() conn, _ := db.TestSetup(t, "postgres") rw := db.New(conn) @@ -1142,7 +1242,6 @@ func TestCredentialRenewalJob_RunLimits(t *testing.T) { } func TestCredentialRenewalJob_Run(t *testing.T) { - // t.Parallel() - this was causing test failures, investigate before un-commenting ctx := context.Background() assert, require := assert.New(t), require.New(t) @@ -1253,7 +1352,6 @@ func TestCredentialRenewalJob_Run(t *testing.T) { } func TestCredentialRenewalJob_RunExpired(t *testing.T) { - // t.Parallel() - this was causing test failures, investigate before un-commenting ctx := context.Background() assert, require := assert.New(t), require.New(t) @@ -1333,7 +1431,6 @@ func TestCredentialRenewalJob_RunExpired(t *testing.T) { } func TestCredentialRenewalJob_NextRunIn(t *testing.T) { - // t.Parallel() - this was causing test failures, investigate before un-commenting ctx := context.Background() conn, _ := db.TestSetup(t, "postgres") rw := db.New(conn) @@ -1491,7 +1588,6 @@ func TestCredentialRenewalJob_NextRunIn(t *testing.T) { } func TestNewCredentialRevocationJob(t *testing.T) { - // t.Parallel() - this was causing test failures, investigate before un-commenting conn, _ := db.TestSetup(t, "postgres") rw := db.New(conn) wrapper := db.TestWrapper(t) @@ -1575,7 +1671,6 @@ func TestNewCredentialRevocationJob(t *testing.T) { } func TestCredentialRevocationJob_RunLimits(t *testing.T) { - // t.Parallel() - this was causing test failures, investigate before un-commenting ctx := context.Background() conn, _ := db.TestSetup(t, "postgres") rw := db.New(conn) @@ -1691,7 +1786,6 @@ func TestCredentialRevocationJob_RunLimits(t *testing.T) { } func TestCredentialRevocationJob_Run(t *testing.T) { - // t.Parallel() - this was causing test failures, investigate before un-commenting ctx := context.Background() assert, require := assert.New(t), require.New(t) @@ -1786,7 +1880,6 @@ func TestCredentialRevocationJob_Run(t *testing.T) { } func TestCredentialRevocationJob_RunDeleted(t *testing.T) { - // t.Parallel() - this was causing test failures, investigate before un-commenting ctx := context.Background() assert, require := assert.New(t), require.New(t) @@ -1896,7 +1989,6 @@ func TestCredentialRevocationJob_RunDeleted(t *testing.T) { } func TestNewCredentialStoreCleanupJob(t *testing.T) { - // t.Parallel() - this was causing test failures, investigate before un-commenting conn, _ := db.TestSetup(t, "postgres") rw := db.New(conn) wrapper := db.TestWrapper(t) @@ -1980,7 +2072,6 @@ func TestNewCredentialStoreCleanupJob(t *testing.T) { } func TestCredentialStoreCleanupJob_Run(t *testing.T) { - // t.Parallel() - this was causing test failures, investigate before un-commenting ctx := context.Background() assert, require := assert.New(t), require.New(t) @@ -2133,7 +2224,6 @@ func TestCredentialStoreCleanupJob_Run(t *testing.T) { } func TestNewCredentialCleanupJob(t *testing.T) { - // t.Parallel() - this was causing test failures, investigate before un-commenting conn, _ := db.TestSetup(t, "postgres") rw := db.New(conn) @@ -2220,7 +2310,6 @@ func TestVaultJobsCorrelationId(t *testing.T) { } func TestCredentialCleanupJob_Run(t *testing.T) { - // t.Parallel() - this was causing test failures, investigate before un-commenting ctx := context.Background() assert, require := assert.New(t), require.New(t) diff --git a/internal/credential/vault/private_store.go b/internal/credential/vault/private_store.go index d3dade9e3a..57c3a01f88 100644 --- a/internal/credential/vault/private_store.go +++ b/internal/credential/vault/private_store.go @@ -43,25 +43,26 @@ func (r *Repository) lookupClientStore(ctx context.Context, publicId string) (*c // a Vault client. If the Vault token for the store is expired all token data will be null // other than the status of expired. type clientStore struct { - PublicId string `gorm:"primary_key"` - ProjectId string - DeleteTime *timestamp.Timestamp - VaultAddress string - Namespace string - CaCert []byte - TlsServerName string - TlsSkipVerify bool - WorkerFilter string - TokenHmac []byte - Token TokenSecret - CtToken []byte - TokenRenewalTime *timestamp.Timestamp - TokenKeyId string - TokenStatus string - ClientCert []byte - ClientKeyId string - ClientKey KeySecret - CtClientKey []byte + PublicId string `gorm:"primary_key"` + ProjectId string + DeleteTime *timestamp.Timestamp + VaultAddress string + Namespace string + CaCert []byte + TlsServerName string + TlsSkipVerify bool + WorkerFilter string + TokenHmac []byte + Token TokenSecret + CtToken []byte + TokenRenewalTime *timestamp.Timestamp + TokenKeyId string + TokenStatus string + TokenExpirationTime *timestamp.Timestamp + ClientCert []byte + ClientKeyId string + ClientKey KeySecret + CtClientKey []byte } func allocClientStore() *clientStore { @@ -98,7 +99,7 @@ func (ps *clientStore) token() *Token { tk.Status = ps.TokenStatus tk.CtToken = ps.CtToken tk.KeyId = ps.TokenKeyId - + tk.ExpirationTime = ps.TokenExpirationTime return tk } diff --git a/internal/credential/vault/supported.go b/internal/credential/vault/supported.go index fcc8e8fd98..50c7d2e2a8 100644 --- a/internal/credential/vault/supported.go +++ b/internal/credential/vault/supported.go @@ -50,8 +50,6 @@ func init() { } } -func gotDocker(t testing.TB) {} - func gotNewServer(t testing.TB, opt ...TestOption) *TestVaultServer { const ( serverTlsTemplate = `{ @@ -189,9 +187,17 @@ func gotNewServer(t testing.TB, opt ...TestOption) *TestVaultServer { resource, err := pool.RunWithOptions(dockerOptions) require.NoError(err) + + server.Shutdown = func(t *testing.T) { + cleanupResource(t, pool, resource) + server.stopped.Store(true) + } + if !opts.skipCleanup { t.Cleanup(func() { - cleanupResource(t, pool, resource) + if !server.stopped.Load() { + cleanupResource(t, pool, resource) + } }) } server.vaultContainer = resource diff --git a/internal/credential/vault/testing.go b/internal/credential/vault/testing.go index e4bff51289..903c164f3b 100644 --- a/internal/credential/vault/testing.go +++ b/internal/credential/vault/testing.go @@ -18,6 +18,7 @@ import ( "net/http" "path" "strings" + "sync/atomic" "testing" "time" @@ -1196,6 +1197,9 @@ type TestVaultServer struct { vaultContainer any ldapContainer any postgresContainer any + + stopped atomic.Bool + Shutdown func(t *testing.T) } // NewTestVaultServer creates and returns a TestVaultServer. Some Vault diff --git a/internal/db/schema/migrations/oss/postgres/103/01_credential_vault_token_renewal_revocation_view.up.sql b/internal/db/schema/migrations/oss/postgres/103/01_credential_vault_token_renewal_revocation_view.up.sql new file mode 100644 index 0000000000..a225ddf759 --- /dev/null +++ b/internal/db/schema/migrations/oss/postgres/103/01_credential_vault_token_renewal_revocation_view.up.sql @@ -0,0 +1,52 @@ +-- Copyright (c) HashiCorp, Inc. +-- SPDX-License-Identifier: BUSL-1.1 + +-- public.credential_vault_token_renewal_revocation source + +begin; + + drop view credential_vault_token_renewal_revocation; + + create view credential_vault_token_renewal_revocation as + with + tokens as ( + select token, -- encrypted + token_hmac, + store_id, + -- renewal time is the midpoint between the last renewal time and the expiration time + last_renewal_time + (expiration_time - last_renewal_time) / 2 as renewal_time, + key_id, + status, + expiration_time + from credential_vault_token + where status in ('current', 'maintaining', 'revoke') + ) + select store.public_id as public_id, + store.project_id as project_id, + store.vault_address as vault_address, + store.namespace as namespace, + store.ca_cert as ca_cert, + store.tls_server_name as tls_server_name, + store.tls_skip_verify as tls_skip_verify, + store.worker_filter as worker_filter, + store.delete_time as delete_time, + token.token as ct_token, -- encrypted + token.token_hmac as token_hmac, + token.renewal_time as token_renewal_time, + token.key_id as token_key_id, + token.status as token_status, + token.expiration_time as token_expiration_time, + cert.certificate as client_cert, + cert.certificate_key as ct_client_key, -- encrypted + cert.key_id as client_key_id + from credential_vault_store store + join tokens token + on store.public_id = token.store_id + left join credential_vault_client_certificate cert + on store.public_id = cert.store_id; + comment on view credential_vault_token_renewal_revocation is + 'credential_vault_token_renewal_revocation is a view where each row contains a credential store and the credential store''s data needed to connect to Vault. ' + 'The view returns a separate row for each active token in Vault (current, maintaining and revoke tokens); this view should only be used for token renewal and revocation. ' + 'Each row may contain encrypted data. This view should not be used to retrieve data which will be returned external to boundary.'; + +commit; \ No newline at end of file