Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
ef31030
Add instance restart policy
sjmiller609 May 16, 2026
365fa7d
Integrate health checks with restart policy
sjmiller609 May 16, 2026
09d0ad2
Simplify restart policy controller
sjmiller609 May 17, 2026
2acf92f
Merge healthcheck policy updates
sjmiller609 May 17, 2026
d267ebc
Add restart strategy field
sjmiller609 May 17, 2026
2faf519
Merge healthcheck updates
sjmiller609 May 17, 2026
504bbcc
Add restart policy network integration coverage
sjmiller609 May 17, 2026
e633e4a
Merge remote-tracking branch 'origin/hypeship/add-healthcheck-policy'…
sjmiller609 May 17, 2026
4b8e75d
Merge remote-tracking branch 'origin/hypeship/add-healthcheck-policy'…
sjmiller609 May 17, 2026
ace7a30
Expect restart policy lifecycle metrics
sjmiller609 May 17, 2026
057c18b
Merge remote-tracking branch 'origin/hypeship/add-healthcheck-policy'…
sjmiller609 May 17, 2026
deef24f
Update instance patch validation test
sjmiller609 May 17, 2026
47dd820
Fix restart policy event reconciliation
sjmiller609 May 17, 2026
3911281
Preserve manual restart stop status
sjmiller609 May 17, 2026
37d0062
Merge remote-tracking branch 'origin/hypeship/add-healthcheck-policy'…
sjmiller609 May 17, 2026
871c622
Merge remote-tracking branch 'origin/hypeship/add-healthcheck-policy'…
sjmiller609 May 17, 2026
cfff1fb
Address restart policy review comments
sjmiller609 May 17, 2026
b562a88
Preserve restart reason during attempts
sjmiller609 May 17, 2026
41a5ecb
Remove restart strategy field
sjmiller609 May 17, 2026
65ff54e
Clear restart status before start no-op
sjmiller609 May 17, 2026
4f6aa80
Recheck state before policy restart
sjmiller609 May 17, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 22 additions & 3 deletions cmd/api/api/instances.go
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,13 @@ func (s *ApiService) CreateInstance(ctx context.Context, request oapi.CreateInst
Message: err.Error(),
}, nil
}
restartPolicy, err := toDomainRestartPolicy(request.Body.RestartPolicy)
if err != nil {
return oapi.CreateInstance400JSONResponse{
Code: "invalid_restart_policy",
Message: err.Error(),
}, nil
}

domainReq := instances.CreateInstanceRequest{
Name: request.Body.Name,
Expand All @@ -319,6 +326,7 @@ func (s *ApiService) CreateInstance(ctx context.Context, request oapi.CreateInst
SkipGuestAgent: request.Body.SkipGuestAgent != nil && *request.Body.SkipGuestAgent,
AutoStandby: autoStandby,
HealthCheck: healthCheck,
RestartPolicy: restartPolicy,
}
if request.Body.SnapshotPolicy != nil {
snapshotPolicy, err := toInstanceSnapshotPolicy(*request.Body.SnapshotPolicy)
Expand Down Expand Up @@ -1044,11 +1052,20 @@ func (s *ApiService) UpdateInstance(ctx context.Context, request oapi.UpdateInst
Message: err.Error(),
}, nil
}
restartPolicy, err := toDomainRestartPolicy(request.Body.RestartPolicy)
if err != nil {
return oapi.UpdateInstance400JSONResponse{
Code: "invalid_restart_policy",
Message: err.Error(),
}, nil
}

result, err := s.InstanceManager.UpdateInstance(ctx, inst.Id, instances.UpdateInstanceRequest{
Env: env,
AutoStandby: autoStandby,
HealthCheck: healthCheck,
Env: env,
AutoStandby: autoStandby,
HealthCheck: healthCheck,
RestartPolicy: restartPolicy,
RestartPolicySet: request.Body.RestartPolicy != nil,
})
if err != nil {
switch {
Expand Down Expand Up @@ -1182,6 +1199,8 @@ func instanceToOAPI(inst instances.Instance) oapi.Instance {
oapiInst.AutoStandby = toOAPIAutoStandbyPolicy(inst.AutoStandby)
oapiInst.HealthCheck = toOAPIHealthCheck(inst.HealthCheck)
oapiInst.HealthStatus = toOAPIHealthStatus(healthcheck.Snapshot(inst.HealthCheck, string(inst.State), inst.HealthCheckRuntime))
oapiInst.RestartPolicy = toOAPIRestartPolicy(inst.RestartPolicy)
oapiInst.RestartStatus = toOAPIRestartStatus(inst.RestartStatus)

// Convert volume attachments
if len(inst.Volumes) > 0 {
Expand Down
147 changes: 147 additions & 0 deletions cmd/api/api/instances_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
mw "github.com/kernel/hypeman/lib/middleware"
"github.com/kernel/hypeman/lib/oapi"
"github.com/kernel/hypeman/lib/paths"
restartpolicy "github.com/kernel/hypeman/lib/restart-policy"
"github.com/kernel/hypeman/lib/system"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
Expand Down Expand Up @@ -281,6 +282,7 @@ func (m *captureUpdateManager) UpdateInstance(ctx context.Context, id string, re
Env: req.Env,
AutoStandby: req.AutoStandby,
HealthCheck: req.HealthCheck,
RestartPolicy: req.RestartPolicy,
CreatedAt: now,
HypervisorType: hypervisor.TypeCloudHypervisor,
},
Expand All @@ -304,6 +306,7 @@ func (m *captureCreateManager) CreateInstance(ctx context.Context, req instances
Vcpus: req.Vcpus,
AutoStandby: req.AutoStandby,
HealthCheck: req.HealthCheck,
RestartPolicy: req.RestartPolicy,
CreatedAt: now,
HypervisorType: hypervisor.TypeCloudHypervisor,
},
Expand Down Expand Up @@ -705,6 +708,48 @@ func TestCreateInstance_MapsHealthCheckPolicy(t *testing.T) {
assert.Equal(t, oapi.InstanceHealthStatusStatusStarting, instance.HealthStatus.Status)
}

// TestCreateInstance_MapsRestartPolicy checks that a restart policy supplied on
// the create request reaches the instance manager in domain form and is echoed
// back on the API response.
func TestCreateInstance_MapsRestartPolicy(t *testing.T) {
	t.Parallel()

	// Put a capturing manager in front of the service's manager so the request
	// handed to the domain layer can be inspected afterwards.
	svc := newTestService(t)
	capture := &captureCreateManager{Manager: svc.InstanceManager}
	svc.InstanceManager = capture

	var (
		mode        = oapi.OnFailure
		backoff     = "7s"
		stableAfter = "2m"
		maxAttempts = 4
	)
	body := oapi.CreateInstanceRequest{
		Name:  "test-restart-policy",
		Image: "docker.io/library/alpine:latest",
		RestartPolicy: &oapi.RestartPolicy{
			Policy:      &mode,
			Backoff:     &backoff,
			StableAfter: &stableAfter,
			MaxAttempts: &maxAttempts,
		},
	}

	resp, err := svc.CreateInstance(ctx(), oapi.CreateInstanceRequestObject{Body: &body})
	require.NoError(t, err)

	created, ok := resp.(oapi.CreateInstance201JSONResponse)
	require.True(t, ok, "expected 201 response")

	// Domain-side view: durations are expected in their normalized spelling
	// ("2m" comes back as "2m0s").
	require.NotNil(t, capture.lastReq)
	require.NotNil(t, capture.lastReq.RestartPolicy)
	got := capture.lastReq.RestartPolicy
	assert.Equal(t, restartpolicy.PolicyOnFailure, got.Policy)
	assert.Equal(t, "7s", got.Backoff)
	assert.Equal(t, "2m0s", got.StableAfter)
	assert.Equal(t, 4, got.MaxAttempts)

	// API-side view: the response carries the policy mode back to the caller.
	instance := oapi.Instance(created)
	require.NotNil(t, instance.RestartPolicy)
	require.NotNil(t, instance.RestartPolicy.Policy)
	assert.Equal(t, oapi.OnFailure, *instance.RestartPolicy.Policy)
}

func TestUpdateInstance_MapsEnvPatch(t *testing.T) {
t.Parallel()
svc := newTestService(t)
Expand Down Expand Up @@ -883,6 +928,108 @@ func TestUpdateInstance_MapsHealthCheckPatch(t *testing.T) {
assert.Equal(t, oapi.InstanceHealthStatusStatusUnknown, instance.HealthStatus.Status)
}

// TestUpdateInstance_MapsRestartPolicyPatch checks that a restart-policy patch
// is forwarded to the manager with RestartPolicySet raised, and that both the
// policy and the restart status of the manager's result appear on the response.
func TestUpdateInstance_MapsRestartPolicyPatch(t *testing.T) {
	t.Parallel()

	const instanceID = "inst-update-restart-policy"
	createdAt := time.Now()

	svc := newTestService(t)

	// The capturing manager returns a stopped instance that already carries a
	// policy and a manual-stop blocked reason.
	capture := &captureUpdateManager{
		Manager: svc.InstanceManager,
		result: &instances.Instance{
			StoredMetadata: instances.StoredMetadata{
				Id:             instanceID,
				Name:           instanceID,
				Image:          "docker.io/library/alpine:latest",
				CreatedAt:      createdAt,
				HypervisorType: hypervisor.TypeCloudHypervisor,
				RestartPolicy: &restartpolicy.Policy{
					Policy:      restartpolicy.PolicyAlways,
					Backoff:     "5s",
					StableAfter: "10m0s",
				},
				RestartStatus: restartpolicy.Status{
					BlockedReason: restartpolicy.BlockedReasonManualStop,
				},
			},
			State: instances.StateStopped,
		},
	}
	svc.InstanceManager = capture

	// Instance placed in the request context; it has no restart policy yet.
	resolved := &instances.Instance{
		StoredMetadata: instances.StoredMetadata{
			Id:             instanceID,
			Name:           instanceID,
			Image:          "docker.io/library/alpine:latest",
			CreatedAt:      createdAt,
			HypervisorType: hypervisor.TypeCloudHypervisor,
		},
		State: instances.StateStopped,
	}

	mode := oapi.Always
	reqCtx := mw.WithResolvedInstance(ctx(), resolved.Id, resolved)
	resp, err := svc.UpdateInstance(reqCtx, oapi.UpdateInstanceRequestObject{
		Id: resolved.Id,
		Body: &oapi.UpdateInstanceRequest{
			RestartPolicy: &oapi.RestartPolicy{Policy: &mode},
		},
	})
	require.NoError(t, err)
	updated, ok := resp.(oapi.UpdateInstance200JSONResponse)
	require.True(t, ok, "expected 200 response")

	// What the manager received.
	require.NotNil(t, capture.lastReq)
	assert.True(t, capture.lastReq.RestartPolicySet)
	require.NotNil(t, capture.lastReq.RestartPolicy)
	assert.Equal(t, restartpolicy.PolicyAlways, capture.lastReq.RestartPolicy.Policy)

	// What the API caller gets back.
	instance := oapi.Instance(updated)
	require.NotNil(t, instance.RestartPolicy)
	require.NotNil(t, instance.RestartStatus)
	require.NotNil(t, instance.RestartStatus.BlockedReason)
	assert.Equal(t, oapi.ManualStop, *instance.RestartStatus.BlockedReason)
}

// TestUpdateInstance_RejectsInvalidRestartPolicy checks that an on_failure
// policy with a "0s" backoff is answered with a 400 "invalid_restart_policy"
// and that the instance manager is never asked to update anything.
func TestUpdateInstance_RejectsInvalidRestartPolicy(t *testing.T) {
	t.Parallel()

	const instanceID = "inst-update-restart-policy"

	svc := newTestService(t)
	capture := &captureUpdateManager{Manager: svc.InstanceManager}
	svc.InstanceManager = capture

	resolved := &instances.Instance{
		StoredMetadata: instances.StoredMetadata{
			Id:             instanceID,
			Name:           instanceID,
			Image:          "docker.io/library/alpine:latest",
			CreatedAt:      time.Now(),
			HypervisorType: hypervisor.TypeCloudHypervisor,
		},
		State: instances.StateStopped,
	}

	var (
		mode    = oapi.OnFailure
		backoff = "0s"
	)
	patch := oapi.UpdateInstanceRequest{
		RestartPolicy: &oapi.RestartPolicy{
			Policy:  &mode,
			Backoff: &backoff,
		},
	}

	reqCtx := mw.WithResolvedInstance(ctx(), resolved.Id, resolved)
	resp, err := svc.UpdateInstance(reqCtx, oapi.UpdateInstanceRequestObject{
		Id:   resolved.Id,
		Body: &patch,
	})
	require.NoError(t, err)

	badReq, ok := resp.(oapi.UpdateInstance400JSONResponse)
	require.True(t, ok, "expected 400 response")
	assert.Equal(t, "invalid_restart_policy", badReq.Code)
	// Rejection must happen before the manager sees the request.
	assert.Nil(t, capture.lastReq)
}

func TestUpdateInstance_RejectsZeroAutoStandbyIgnoreDestinationPort(t *testing.T) {
t.Parallel()
svc := newTestService(t)
Expand Down
80 changes: 80 additions & 0 deletions cmd/api/api/restart_policy.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
package api

import (
"github.com/kernel/hypeman/lib/oapi"
restartpolicy "github.com/kernel/hypeman/lib/restart-policy"
"github.com/samber/lo"
)

// toDomainRestartPolicy converts the optional API restart policy into its
// domain representation.
//
// A nil input yields (nil, nil). Otherwise every field present on the API
// object is copied over, absent fields keep their zero value, and the result
// is passed through restartpolicy.NormalizePolicy. If normalization fails the
// error is returned unchanged together with a nil policy.
func toDomainRestartPolicy(policy *oapi.RestartPolicy) (*restartpolicy.Policy, error) {
	if policy == nil {
		return nil, nil
	}

	var domain restartpolicy.Policy
	if mode := policy.Policy; mode != nil {
		domain.Policy = restartpolicy.PolicyMode(*mode)
	}
	if backoff := policy.Backoff; backoff != nil {
		domain.Backoff = *backoff
	}
	if attempts := policy.MaxAttempts; attempts != nil {
		domain.MaxAttempts = *attempts
	}
	if stable := policy.StableAfter; stable != nil {
		domain.StableAfter = *stable
	}

	result, err := restartpolicy.NormalizePolicy(&domain)
	if err != nil {
		// Never hand back a partially processed policy alongside an error.
		return nil, err
	}
	return result, nil
}

// toOAPIRestartPolicy converts a domain restart policy into its API form.
//
// A nil policy maps to nil. The policy mode is always set on the result;
// Backoff and StableAfter are included only when non-empty, and MaxAttempts
// only when greater than zero, so unset values stay absent from the response.
func toOAPIRestartPolicy(policy *restartpolicy.Policy) *oapi.RestartPolicy {
	if policy == nil {
		return nil
	}

	apiPolicy := oapi.RestartPolicy{
		Policy: lo.ToPtr(oapi.RestartPolicyPolicy(policy.Policy)),
	}
	// Each optional field points at its own copy, never into the domain struct.
	if backoff := policy.Backoff; backoff != "" {
		apiPolicy.Backoff = &backoff
	}
	if attempts := policy.MaxAttempts; attempts > 0 {
		apiPolicy.MaxAttempts = &attempts
	}
	if stable := policy.StableAfter; stable != "" {
		apiPolicy.StableAfter = &stable
	}
	return &apiPolicy
}

// toOAPIRestartStatus converts the domain restart status into its API form.
//
// A status for which IsZero reports true maps to nil. Otherwise Attempts is
// always set, the blocked reason and last reason are included only when
// non-empty, and the attempt timestamps are included only when present,
// converted to UTC.
func toOAPIRestartStatus(status restartpolicy.Status) *oapi.RestartStatus {
	if status.IsZero() {
		return nil
	}

	attempts := status.Attempts
	apiStatus := oapi.RestartStatus{Attempts: &attempts}

	if status.BlockedReason != "" {
		apiStatus.BlockedReason = lo.ToPtr(oapi.RestartStatusBlockedReason(status.BlockedReason))
	}
	if at := status.LastAttemptAt; at != nil {
		apiStatus.LastAttemptAt = lo.ToPtr(at.UTC())
	}
	if at := status.NextAttemptAt; at != nil {
		apiStatus.NextAttemptAt = lo.ToPtr(at.UTC())
	}
	if status.LastReason != "" {
		apiStatus.LastReason = lo.ToPtr(oapi.RestartStatusLastReason(status.LastReason))
	}
	return &apiStatus
}
8 changes: 8 additions & 0 deletions cmd/api/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -571,6 +571,14 @@ func run() error {
return app.HealthCheckController.Run(gctx)
})
}
if restartController, ok := app.InstanceManager.(interface {
StartRestartPolicyController(context.Context) error
}); ok {
grp.Go(func() error {
logger.Info("starting restart policy controller")
return restartController.StartRestartPolicyController(gctx)
})
}

// Run the server
grp.Go(func() error {
Expand Down
8 changes: 6 additions & 2 deletions lib/healthcheck/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,10 @@ Stopping, deleting, standing by, or restoring an instance stops active checks. S

## Restart Policy

Health checks only report health. They do not restart instances.
Health checks do not restart instances by themselves.

If Hypeman later adds restart-on-unhealthy behavior, it should consume `health_status=unhealthy` explicitly rather than making health checks mutate lifecycle state.
When an instance also has `restart_policy.policy=on_failure` or `restart_policy.policy=always`, an `unhealthy` health status becomes a restart-policy failure signal. The restart policy applies its normal backoff, max attempts, manual-stop suppression, and stable-window reset before Hypeman restarts the whole instance.

With `restart_policy.policy=never` or no restart policy, health checks only report status.

Health checks still do not mutate lifecycle state directly. The instance remains `Running` while unhealthy until restart policy chooses to stop and start it.
6 changes: 6 additions & 0 deletions lib/instances/create.go
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,7 @@ func (m *manager) createInstance(
SnapshotPolicy: cloneSnapshotPolicy(req.SnapshotPolicy),
AutoStandby: cloneAutoStandbyPolicy(req.AutoStandby),
HealthCheck: cloneHealthCheckPolicy(req.HealthCheck),
RestartPolicy: cloneRestartPolicy(req.RestartPolicy),
}

// 12. Ensure directories
Expand Down Expand Up @@ -638,6 +639,11 @@ func validateCreateRequest(req *CreateInstanceRequest) error {
if err := validateHealthCheckCompatibility(req.HealthCheck, req.NetworkEnabled, req.SkipGuestAgent); err != nil {
return err
}
normalizedRestartPolicy, err := normalizeRestartPolicy(req.RestartPolicy)
if err != nil {
return err
}
req.RestartPolicy = normalizedRestartPolicy

// Validate volume attachments
if err := validateVolumeAttachments(req.Volumes); err != nil {
Expand Down
5 changes: 5 additions & 0 deletions lib/instances/fork.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
"github.com/kernel/hypeman/lib/instances/phasetracking"
"github.com/kernel/hypeman/lib/logger"
"github.com/kernel/hypeman/lib/network"
restartpolicy "github.com/kernel/hypeman/lib/restart-policy"
"github.com/nrednav/cuid2"
"go.opentelemetry.io/otel/attribute"
"gvisor.dev/gvisor/pkg/cleanup"
Expand Down Expand Up @@ -286,6 +287,7 @@ func (m *manager) forkInstanceFromStoppedOrStandby(ctx context.Context, id strin
forkMeta.VsockSocket = m.paths.InstanceSocket(forkID, hypervisor.VsockSocketNameForType(forkMeta.HypervisorType))
forkMeta.ExitCode = nil
forkMeta.ExitMessage = ""
forkMeta.RestartStatus = restartpolicy.Status{}
// Forks are new instances; phase accounting must not inherit the source's
// cumulative durations. The first transition into the fork's runtime
// phase (Standby for snapshot forks, Stopped for stopped forks) will be
Expand Down Expand Up @@ -504,6 +506,9 @@ func cloneStoredMetadata(src StoredMetadata) StoredMetadata {
if src.HealthCheck != nil {
dst.HealthCheck = cloneHealthCheckPolicy(src.HealthCheck)
}
if src.RestartPolicy != nil {
dst.RestartPolicy = cloneRestartPolicy(src.RestartPolicy)
}
if src.SnapshotPolicy != nil {
dst.SnapshotPolicy = cloneSnapshotPolicy(src.SnapshotPolicy)
}
Expand Down
Loading
Loading