registry: add heartbeat call for running builds (#11846)

* registry: add heartbeat call for running builds

When a build is running, we send periodic heartbeats to HCP Packer's
API.

This will be used on the service side to detect if a build is stalled on
the core side because of a crash or any other malfunction that caused
Packer not to send an update on the status of a build.

* registry: only update status on status update

Prior to this commit, we'd send updates to both the labels and the
cloud-provider whenever an update to the status of a build would be
sent.

This would cause a bug in which once a build reached the post-processing
state, and its cloud-provider was set, the status could not be updated
anymore, as the cloud-provider would be set and further updates are
rejected by the platform.

To avoid this problem, we only transmit the status when doing a status
update, and no other fields along with it.

* internal: publicise CompleteBuild function

The markBuildComplete private function used to be called from a final
status update to DONE, through the UpdateBuildStatus function.

The problem is that the (now) CompleteBuild function not only updates
the status of the build, but also all its metadata.

For consistency, we remove the indirection, and explicitly call
CompleteBuild when we want to finish a build.

* internal: block status updates on DONE builds
pull/11864/head
Lucas Bajolet 4 years ago committed by GitHub
parent f752f8dd63
commit 0a41694ee5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -7,6 +7,7 @@ import (
"log"
"os"
"sync"
"time"
"github.com/hashicorp/go-multierror"
"github.com/hashicorp/hcp-sdk-go/clients/cloud-packer-service/stable/2021-04-30/models"
@ -15,6 +16,10 @@ import (
"google.golang.org/grpc/codes"
)
// HeartbeatPeriod dictates how often a heartbeat is sent to HCP to signal a
// build is still alive.
//
// HeartbeatBuild uses it as the interval of its ticker.
const HeartbeatPeriod = 2 * time.Minute
// Bucket represents a single Image bucket on the HCP Packer registry.
type Bucket struct {
Slug string
@ -139,9 +144,10 @@ func (b *Bucket) CreateInitialBuildForIteration(ctx context.Context, componentTy
}
// UpdateBuildStatus updates the status of a build entry on the HCP Packer registry with its current local status.
// For updating a build status to DONE use CompleteBuild.
func (b *Bucket) UpdateBuildStatus(ctx context.Context, name string, status models.HashicorpCloudPackerBuildStatus) error {
if status == models.HashicorpCloudPackerBuildStatusDONE {
return b.markBuildComplete(ctx, name)
return fmt.Errorf("do not use UpdateBuildStatus for updating to DONE")
}
buildToUpdate, err := b.Iteration.Build(name)
@ -153,13 +159,17 @@ func (b *Bucket) UpdateBuildStatus(ctx context.Context, name string, status mode
return fmt.Errorf("the build for the component %q does not have a valid id", name)
}
if buildToUpdate.Status == models.HashicorpCloudPackerBuildStatusDONE {
return fmt.Errorf("cannot modify status of DONE build %s", name)
}
_, err = b.client.UpdateBuild(ctx,
buildToUpdate.ID,
buildToUpdate.RunUUID,
buildToUpdate.CloudProvider,
"",
"",
buildToUpdate.Labels,
"",
nil,
status,
nil,
)
@ -171,10 +181,10 @@ func (b *Bucket) UpdateBuildStatus(ctx context.Context, name string, status mode
return nil
}
// markBuildComplete should be called to set a build on the HCP Packer registry to DONE.
// CompleteBuild should be called to set a build on the HCP Packer registry to DONE.
// Upon a successful call CompleteBuild will publish all images created by the named build,
// and set the registry build to done. A build with no images can not be set to DONE.
func (b *Bucket) markBuildComplete(ctx context.Context, name string) error {
func (b *Bucket) CompleteBuild(ctx context.Context, name string) error {
buildToUpdate, err := b.Iteration.Build(name)
if err != nil {
return err
@ -402,3 +412,56 @@ func (b *Bucket) IsExpectingBuildForComponent(buildName string) bool {
return build.IsNotDone()
}
// HeartbeatBuild periodically sends status updates for the build.
//
// This lets HCP infer that a build is still running and should not be marked
// as cancelled by the HCP Packer registry service.
//
// A goroutine is started that sends a RUNNING status update every
// HeartbeatPeriod until either ctx is done or the returned stop function is
// called. The stop function may safely be called more than once; it only
// signals the goroutine to stop and does not wait for it to exit.
//
// A nil stop function and an error are returned when b.Iteration.Build fails
// for the given build name; no goroutine is started in that case.
//
// Usage:
//
//	stop, err := b.HeartbeatBuild(ctx, build)
//	if err != nil {
//		return err
//	}
//	defer stop()
func (b *Bucket) HeartbeatBuild(ctx context.Context, build string) (func(), error) {
	buildToUpdate, err := b.Iteration.Build(build)
	if err != nil {
		return nil, err
	}

	heartbeatChan := make(chan struct{})
	go func() {
		log.Printf("[TRACE] starting heartbeats")

		tick := time.NewTicker(HeartbeatPeriod)
		// Deferred calls run in reverse order: the ticker is stopped first,
		// then the stop is logged.
		defer log.Printf("[TRACE] stopped heartbeating build %s", build)
		defer tick.Stop()

		for {
			select {
			case <-heartbeatChan:
				return
			case <-ctx.Done():
				return
			case <-tick.C:
				// Only the status is transmitted; every other field is left
				// empty. The error is kept local to this goroutine instead of
				// writing to the enclosing function's err variable.
				_, err := b.client.UpdateBuild(ctx,
					buildToUpdate.ID,
					buildToUpdate.RunUUID,
					"",
					"",
					"",
					nil,
					models.HashicorpCloudPackerBuildStatusRUNNING,
					nil,
				)
				if err != nil {
					log.Printf("[ERROR] failed to send heartbeat for build %q: %s", build, err)
					continue
				}
				log.Printf("[TRACE] updating build status for %q to running", build)
			}
		}
	}()

	// Closing an already closed channel panics, so guard the close to make
	// the returned function idempotent.
	var stopOnce sync.Once
	return func() {
		stopOnce.Do(func() {
			close(heartbeatChan)
		})
	}, nil
}

@ -50,6 +50,14 @@ func (b *RegistryBuilder) Run(ctx context.Context, ui packersdk.Ui, hook packers
log.Printf("[TRACE] failed to update HCP Packer registry status for %q: %s", b.Name, err)
}
cleanupHeartbeat, err := b.ArtifactMetadataPublisher.HeartbeatBuild(ctx, b.Name)
if err != nil {
log.Printf("[ERROR] failed to start heartbeat function")
}
if cleanupHeartbeat != nil {
defer cleanupHeartbeat()
}
ui.Say(fmt.Sprintf("Publishing build details for %s to the HCP Packer registry", b.Name))
artifact, err := b.Builder.Run(ctx, ui, hook)
if err != nil {

@ -39,7 +39,7 @@ func (p *RegistryPostProcessor) PostProcess(ctx context.Context, ui packersdk.Ui
// This is a bit of a hack for now to denote that this pp should just update the state of a build in the Packer registry.
// TODO create an actual post-processor that we can embed here that will do the updating and printing.
if p.PostProcessor == nil {
if parErr := p.ArtifactMetadataPublisher.UpdateBuildStatus(ctx, p.BuilderType, models.HashicorpCloudPackerBuildStatusDONE); parErr != nil {
if parErr := p.ArtifactMetadataPublisher.CompleteBuild(ctx, p.BuilderType); parErr != nil {
err := fmt.Errorf("[TRACE] failed to update Packer registry with image artifacts for %q: %s", p.BuilderType, parErr)
return nil, false, true, err
}
@ -53,6 +53,26 @@ func (p *RegistryPostProcessor) PostProcess(ctx context.Context, ui packersdk.Ui
return r, true, false, nil
}
// Bump the build status first so that a chain of post-processors that
// don't heartbeat cannot let too much time pass between two refreshes,
// which would let the build go to the FAILED status.
err := p.ArtifactMetadataPublisher.UpdateBuildStatus(
ctx,
p.BuilderType,
models.HashicorpCloudPackerBuildStatusRUNNING,
)
if err != nil {
log.Printf("[TRACE] failed to heartbeat running build %s: %s", p.BuilderType, err)
}
cleanupHeartbeat, err := p.ArtifactMetadataPublisher.HeartbeatBuild(ctx, p.BuilderType)
if err != nil {
log.Printf("[ERROR] failed to start heartbeat function")
}
if cleanupHeartbeat != nil {
defer cleanupHeartbeat()
}
source, keep, override, err := p.PostProcessor.PostProcess(ctx, ui, source)
if err != nil {
if parErr := p.ArtifactMetadataPublisher.UpdateBuildStatus(ctx, p.BuilderType, models.HashicorpCloudPackerBuildStatusFAILED); parErr != nil {

Loading…
Cancel
Save