From 0a41694ee51d6a627a9a668a2ce30984c1065d65 Mon Sep 17 00:00:00 2001 From: Lucas Bajolet <105649352+lbajolet-hashicorp@users.noreply.github.com> Date: Tue, 5 Jul 2022 14:37:39 -0400 Subject: [PATCH] registry: add heartbeat call for running builds (#11846) * registry: add heartbeat call for running builds When a build is running, we send periodic heartbeats to HCP Packer's API. This will be used on the service side to detect if a build is stalled on the core side because of a crash or any other malfunction that caused Packer not to send an update on the status of a build. * registry: only update status on status update Prior to this commit, we'd send updates to both the labels and the cloud-provider whenever an update to the status of a build would be sent. This would cause a bug in which once a build reached the post-processing state, and its cloud-provider was set, the status could not be updated anymore, as the cloud-provider would be set and further updates are rejected by the platform. To avoid this problem, we only transmit the status when doing a status update, and no other fields along with it. * internal: publicise CompleteBuild function The markBuildComplete private function used to be called from a final status update to DONE, through the UpdateBuildStatus function. The problem being that the (now) CompleteBuild function not only updates the status of the build, but also all its metadata. For consistency, we remove the indirection, and explicitly call CompleteBuild when we want to finish a build. 
* internal: block status updates on DONE builds --- internal/registry/types.bucket.go | 73 ++++++++++++++++++++++++++++--- packer/registry_builder.go | 8 ++++ packer/registry_post_processor.go | 22 +++++++++- 3 files changed, 97 insertions(+), 6 deletions(-) diff --git a/internal/registry/types.bucket.go b/internal/registry/types.bucket.go index 34c69a9c9..25dc56ecd 100644 --- a/internal/registry/types.bucket.go +++ b/internal/registry/types.bucket.go @@ -7,6 +7,7 @@ import ( "log" "os" "sync" + "time" "github.com/hashicorp/go-multierror" "github.com/hashicorp/hcp-sdk-go/clients/cloud-packer-service/stable/2021-04-30/models" @@ -15,6 +16,10 @@ import ( "google.golang.org/grpc/codes" ) +// HeartbeatPeriod dictates how often a heartbeat is sent to HCP to signal a +// build is still alive. +const HeartbeatPeriod = 2 * time.Minute + // Bucket represents a single Image bucket on the HCP Packer registry. type Bucket struct { Slug string @@ -139,9 +144,10 @@ func (b *Bucket) CreateInitialBuildForIteration(ctx context.Context, componentTy } // UpdateBuildStatus updates the status of a build entry on the HCP Packer registry with its current local status. +// For updating a build status to DONE use CompleteBuild. 
func (b *Bucket) UpdateBuildStatus(ctx context.Context, name string, status models.HashicorpCloudPackerBuildStatus) error { if status == models.HashicorpCloudPackerBuildStatusDONE { - return b.markBuildComplete(ctx, name) + return fmt.Errorf("do not use UpdateBuildStatus for updating to DONE") } buildToUpdate, err := b.Iteration.Build(name) @@ -153,13 +159,17 @@ func (b *Bucket) UpdateBuildStatus(ctx context.Context, name string, status mode return fmt.Errorf("the build for the component %q does not have a valid id", name) } + if buildToUpdate.Status == models.HashicorpCloudPackerBuildStatusDONE { + return fmt.Errorf("cannot modify status of DONE build %s", name) + } + _, err = b.client.UpdateBuild(ctx, buildToUpdate.ID, buildToUpdate.RunUUID, - buildToUpdate.CloudProvider, "", "", - buildToUpdate.Labels, + "", + nil, status, nil, ) @@ -171,10 +181,10 @@ func (b *Bucket) UpdateBuildStatus(ctx context.Context, name string, status mode return nil } -// markBuildComplete should be called to set a build on the HCP Packer registry to DONE. +// CompleteBuild should be called to set a build on the HCP Packer registry to DONE. // Upon a successful call markBuildComplete will publish all images created by the named build, // and set the registry build to done. A build with no images can not be set to DONE. -func (b *Bucket) markBuildComplete(ctx context.Context, name string) error { +func (b *Bucket) CompleteBuild(ctx context.Context, name string) error { buildToUpdate, err := b.Iteration.Build(name) if err != nil { return err @@ -402,3 +412,56 @@ func (b *Bucket) IsExpectingBuildForComponent(buildName string) bool { return build.IsNotDone() } + +// HeartbeatBuild periodically sends status updates for the build +// +// This lets HCP infer that a build is still running and should not be marked +// as cancelled by the HCP Packer registry service. 
+// +// Usage: cleanup, err := b.HeartbeatBuild(ctx, build); on success, defer cleanup() +func (b *Bucket) HeartbeatBuild(ctx context.Context, build string) (func(), error) { + buildToUpdate, err := b.Iteration.Build(build) + if err != nil { + return nil, err + } + + heartbeatChan := make(chan struct{}) + go func() { + log.Printf("[TRACE] starting heartbeats") + + tick := time.NewTicker(HeartbeatPeriod) + + outHeartbeats: + for { + select { + case <-heartbeatChan: + tick.Stop() + break outHeartbeats + case <-ctx.Done(): + tick.Stop() + break outHeartbeats + case <-tick.C: + _, err = b.client.UpdateBuild(ctx, + buildToUpdate.ID, + buildToUpdate.RunUUID, + "", + "", + "", + nil, + models.HashicorpCloudPackerBuildStatusRUNNING, + nil, + ) + if err != nil { + log.Printf("[ERROR] failed to send heartbeat for build %q: %s", build, err) + } else { + log.Printf("[TRACE] updating build status for %q to running", build) + } + } + } + + log.Printf("[TRACE] stopped heartbeating build %s", build) + }() + return func() { + close(heartbeatChan) + }, nil +} diff --git a/packer/registry_builder.go b/packer/registry_builder.go index 2a3221f5b..3374da4e9 100644 --- a/packer/registry_builder.go +++ b/packer/registry_builder.go @@ -50,6 +50,14 @@ func (b *RegistryBuilder) Run(ctx context.Context, ui packersdk.Ui, hook packers log.Printf("[TRACE] failed to update HCP Packer registry status for %q: %s", b.Name, err) } + cleanupHeartbeat, err := b.ArtifactMetadataPublisher.HeartbeatBuild(ctx, b.Name) + if err != nil { + log.Printf("[ERROR] failed to start heartbeat function") + } + if cleanupHeartbeat != nil { + defer cleanupHeartbeat() + } + ui.Say(fmt.Sprintf("Publishing build details for %s to the HCP Packer registry", b.Name)) artifact, err := b.Builder.Run(ctx, ui, hook) if err != nil { diff --git a/packer/registry_post_processor.go b/packer/registry_post_processor.go index 69f5e097b..25804e763 100644 --- a/packer/registry_post_processor.go +++ b/packer/registry_post_processor.go @@ -39,7 +39,7 @@ func (p 
*RegistryPostProcessor) PostProcess(ctx context.Context, ui packersdk.Ui // This is a bit of a hack for now to denote that this pp should just update the state of a build in the Packer registry. // TODO create an actual post-processor that we can embed here that will do the updating and printing. if p.PostProcessor == nil { - if parErr := p.ArtifactMetadataPublisher.UpdateBuildStatus(ctx, p.BuilderType, models.HashicorpCloudPackerBuildStatusDONE); parErr != nil { + if parErr := p.ArtifactMetadataPublisher.CompleteBuild(ctx, p.BuilderType); parErr != nil { err := fmt.Errorf("[TRACE] failed to update Packer registry with image artifacts for %q: %s", p.BuilderType, parErr) return nil, false, true, err } @@ -53,6 +53,26 @@ func (p *RegistryPostProcessor) PostProcess(ctx context.Context, ui packersdk.Ui return r, true, false, nil } + // Bump build status first so we don't end-up chaining post-processors + // that don't heartbeat, hence letting too long happen between two + // refreshes, and letting the build go to the FAILED status. + err := p.ArtifactMetadataPublisher.UpdateBuildStatus( + ctx, + p.BuilderType, + models.HashicorpCloudPackerBuildStatusRUNNING, + ) + if err != nil { + log.Printf("[TRACE] failed to heartbeat running build %s: %s", p.BuilderType, err) + } + + cleanupHeartbeat, err := p.ArtifactMetadataPublisher.HeartbeatBuild(ctx, p.BuilderType) + if err != nil { + log.Printf("[ERROR] failed to start heartbeat function") + } + if cleanupHeartbeat != nil { + defer cleanupHeartbeat() + } + source, keep, override, err := p.PostProcessor.PostProcess(ctx, ui, source) if err != nil { if parErr := p.ArtifactMetadataPublisher.UpdateBuildStatus(ctx, p.BuilderType, models.HashicorpCloudPackerBuildStatusFAILED); parErr != nil {