Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions internal/jobs/deploy_failure_autopsy.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ const (
workerFailureReasonCrashLoopBackOff = "CrashLoopBackOff"
workerFailureReasonBuildFailed = "BuildFailed"
workerFailureReasonDeadlineExceeded = "DeadlineExceeded"
workerFailureReasonStartFailed = "StartFailed"
workerFailureReasonError = "Error"
workerFailureReasonUnknown = "Unknown"
)
Expand Down Expand Up @@ -158,6 +159,11 @@ var workerFailureHint = map[string]string{
"Large base images or slow package installs can cause this. " +
"Try a smaller base image (e.g. alpine) and pre-install dependencies in the Dockerfile.",

workerFailureReasonStartFailed: "Kubernetes created your app's pod but the container could not start. " +
"The most common cause is a built image with no CMD/ENTRYPOINT (nothing to run) " +
"or an invalid container configuration. Make sure your Dockerfile ends with a " +
"CMD or ENTRYPOINT instruction, then re-deploy.",

workerFailureReasonError: "A Kubernetes replica failure was detected. " +
"This is often a transient scheduling or resource constraint. " +
"Re-deploy to retry; if it persists, check your Dockerfile for correct CMD/ENTRYPOINT.",
Expand Down Expand Up @@ -708,6 +714,14 @@ func extractPodFailure(pod *corev1.Pod, result *autopsyResult) {
result.event = fmt.Sprintf("ImagePullBackOff: %s", w.Message)
case "CrashLoopBackOff":
result.reason = workerFailureReasonCrashLoopBackOff
case "CreateContainerError", "CreateContainerConfigError", "RunContainerError":
// The pod was created but its container can't start — modal
// cause is a built image with no CMD/ENTRYPOINT ("no command
// specified") or an invalid container config. The container
// never runs, so there are no app logs; the waiting Message is
// the most useful diagnostic we can surface.
result.reason = workerFailureReasonStartFailed
result.event = fmt.Sprintf("%s: %s", w.Reason, w.Message)
}
}
// lastState gives us the terminated exit code even for CrashLoopBackOff.
Expand Down
23 changes: 23 additions & 0 deletions internal/jobs/deploy_failure_autopsy_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ var workerKnownReasons = []string{
workerFailureReasonCrashLoopBackOff,
workerFailureReasonBuildFailed,
workerFailureReasonDeadlineExceeded,
workerFailureReasonStartFailed,
workerFailureReasonError,
workerFailureReasonUnknown,
}
Expand Down Expand Up @@ -193,6 +194,28 @@ func TestExtractPodFailure_ImagePullBackOff(t *testing.T) {
}
}

// TestExtractPodFailure_StartFailed covers the broken-image runtime case: the
// pod is created but the container can't start (CreateContainerError "no command
// specified" from a 474-byte empty image, CreateContainerConfigError, or
// RunContainerError). The reason must classify as StartFailed and the waiting
// message must be surfaced in event (the only diagnostic — there are no logs).
func TestExtractPodFailure_StartFailed(t *testing.T) {
for _, waitReason := range []string{"CreateContainerError", "CreateContainerConfigError", "RunContainerError"} {
t.Run(waitReason, func(t *testing.T) {
pod := buildPodWithWaiting(waitReason, "failed to generate spec: no command specified")
result := &autopsyResult{reason: workerFailureReasonUnknown}
extractPodFailure(pod, result)

if result.reason != workerFailureReasonStartFailed {
t.Errorf("reason = %q, want StartFailed", result.reason)
}
if result.event == "" {
t.Error("expected non-empty event carrying the waiting message")
}
})
}
}

func TestExtractPodFailure_Evicted(t *testing.T) {
pod := &corev1.Pod{
ObjectMeta: metav1.ObjectMeta{Name: "evicted-pod"},
Expand Down
48 changes: 48 additions & 0 deletions internal/jobs/deploy_lifecycle_coverage_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,54 @@ func TestDeploymentStatusFromK8s_Matrix(t *testing.T) {
d: &appsv1.Deployment{Status: appsv1.DeploymentStatus{UnavailableReplicas: 1}},
want: deployStatusDeploying,
},
{
// Broken-image runtime silent-failure: progress deadline exceeded,
// no available replica → failed (pre-fix this was "deploying" forever).
name: "ProgressDeadlineExceeded with zero available is failed",
d: &appsv1.Deployment{
Status: appsv1.DeploymentStatus{
UnavailableReplicas: 1,
Conditions: []appsv1.DeploymentCondition{{
Type: appsv1.DeploymentProgressing,
Status: corev1.ConditionFalse,
Reason: progressDeadlineExceededReason,
}},
},
},
want: deployStatusFailed,
},
{
// A serving deploy whose newest rollout timed out (failed redeploy,
// previous ReplicaSet still serving) stays healthy — available-replica
// check precedes the deadline check.
name: "ProgressDeadlineExceeded but a replica is available stays healthy",
d: &appsv1.Deployment{
Status: appsv1.DeploymentStatus{
AvailableReplicas: 1,
Conditions: []appsv1.DeploymentCondition{{
Type: appsv1.DeploymentProgressing,
Status: corev1.ConditionFalse,
Reason: progressDeadlineExceededReason,
}},
},
},
want: deployStatusHealthy,
},
{
// Progressing=True (rollout within deadline) is NOT a deadline failure.
name: "Progressing True with zero available is deploying",
d: &appsv1.Deployment{
Status: appsv1.DeploymentStatus{
UnavailableReplicas: 1,
Conditions: []appsv1.DeploymentCondition{{
Type: appsv1.DeploymentProgressing,
Status: corev1.ConditionTrue,
Reason: "ReplicaSetUpdated",
}},
},
},
want: deployStatusDeploying,
},
{
name: "all zeros is building",
d: &appsv1.Deployment{Status: appsv1.DeploymentStatus{}},
Expand Down
99 changes: 99 additions & 0 deletions internal/jobs/deploy_runtime_failed_metric_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
package jobs

// deploy_runtime_failed_metric_test.go — covers the runtime rollout-failure
// detector wired into computeNewStatus (broken-image silent-failure fix,
// 2026-06-08). Asserts BOTH that a ProgressDeadlineExceeded rollout maps to
// "failed" AND that instant_deploy_runtime_failed_detected_total increments for
// that path only (and NOT for a generic DeploymentReplicaFailure, which is a
// distinct cause out of this counter's scope).

import (
"context"
"testing"

sqlmock "github.com/DATA-DOG/go-sqlmock"
"github.com/prometheus/client_golang/prometheus/testutil"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"

"instant.dev/worker/internal/metrics"
)

// TestComputeNewStatus_ProgressDeadlineExceeded_FailsAndCountsMetric feeds
// computeNewStatus a Deployment whose Progressing condition is False with the
// ProgressDeadlineExceeded reason and no available replica, then checks that
// the status is "failed" and that the runtime-failure counter for the
// progress-deadline label grew by exactly one.
func TestComputeNewStatus_ProgressDeadlineExceeded_FailsAndCountsMetric(t *testing.T) {
	mockDB, _, mockErr := sqlmock.New()
	if mockErr != nil {
		t.Fatalf("sqlmock.New: %v", mockErr)
	}
	defer mockDB.Close()

	// Rollout exceeded its progress deadline with no available replica —
	// the broken-image runtime failure (container can't start).
	stalled := &appsv1.Deployment{
		Status: appsv1.DeploymentStatus{
			UnavailableReplicas: 1,
			Conditions: []appsv1.DeploymentCondition{{
				Type:   appsv1.DeploymentProgressing,
				Status: corev1.ConditionFalse,
				Reason: progressDeadlineExceededReason,
			}},
		},
	}
	fakeK8s := newFakeDeployStatusK8s()
	fakeK8s.objs["instant-deploy-pde|app-pde"] = stalled
	reconciler := NewDeployStatusReconciler(mockDB, fakeK8s)

	// The counter is process-global, so compare against a baseline instead of
	// asserting an absolute value.
	counterValue := func() float64 {
		return testutil.ToFloat64(
			metrics.DeployRuntimeFailedDetectedTotal.WithLabelValues(runtimeFailReasonProgressDeadline))
	}
	baseline := counterValue()

	got, err := reconciler.computeNewStatus(context.Background(), "app-pde")
	if err != nil {
		t.Fatalf("computeNewStatus: %v", err)
	}
	if got != deployStatusFailed {
		t.Errorf("status = %q, want failed", got)
	}

	if delta := counterValue() - baseline; delta != 1 {
		t.Errorf("DeployRuntimeFailedDetectedTotal delta = %v, want 1", delta)
	}
}

// TestComputeNewStatus_ReplicaFailure_DoesNotCountRuntimeMetric guards the
// attribution boundary of the runtime-failure counter: a Deployment carrying a
// true DeploymentReplicaFailure condition must still resolve to "failed", yet
// the progress-deadline label of the counter must stay unchanged because the
// cause is a different one.
func TestComputeNewStatus_ReplicaFailure_DoesNotCountRuntimeMetric(t *testing.T) {
	mockDB, _, mockErr := sqlmock.New()
	if mockErr != nil {
		t.Fatalf("sqlmock.New: %v", mockErr)
	}
	defer mockDB.Close()

	replicaFailure := &appsv1.Deployment{
		Status: appsv1.DeploymentStatus{
			Conditions: []appsv1.DeploymentCondition{{
				Type:   appsv1.DeploymentReplicaFailure,
				Status: corev1.ConditionTrue,
			}},
		},
	}
	fakeK8s := newFakeDeployStatusK8s()
	fakeK8s.objs["instant-deploy-rf|app-rf"] = replicaFailure
	reconciler := NewDeployStatusReconciler(mockDB, fakeK8s)

	// The counter is process-global, so compare against a baseline instead of
	// asserting an absolute value.
	counterValue := func() float64 {
		return testutil.ToFloat64(
			metrics.DeployRuntimeFailedDetectedTotal.WithLabelValues(runtimeFailReasonProgressDeadline))
	}
	baseline := counterValue()

	got, err := reconciler.computeNewStatus(context.Background(), "app-rf")
	if err != nil {
		t.Fatalf("computeNewStatus: %v", err)
	}
	if got != deployStatusFailed {
		t.Errorf("status = %q, want failed", got)
	}

	if current := counterValue(); current != baseline {
		t.Errorf("replica-failure must not bump runtime counter: delta = %v", current-baseline)
	}
}
52 changes: 51 additions & 1 deletion internal/jobs/deploy_status_reconcile.go
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,20 @@ const (
deployStatusFailed = "failed"
deployStatusStopped = "stopped"

// progressDeadlineExceededReason is the Reason k8s stamps on a Deployment's
// Progressing condition (status=False) when a rollout fails to make progress
// within spec.progressDeadlineSeconds (default 600s). k8s does not export it
// as a typed constant (deploymentutil.TimedOutReason internally), so it is
// named here per the no-hardcoded-strings rule. Kept verbatim in sync with
// the api's k8s provider (progressDeadlineExceededReason in client.go).
progressDeadlineExceededReason = "ProgressDeadlineExceeded"

// runtimeFailReasonProgressDeadline is the bounded `reason` label on
// instant_deploy_runtime_failed_detected_total for a rollout that exceeded
// its progress deadline with no available replica (the broken-image runtime
// silent-failure class).
runtimeFailReasonProgressDeadline = "progress_deadline_exceeded"

// stuckBuildingReapMessage is stamped onto a reaped row's error_message
// (only when the api hadn't already written one) so the user-facing
// failure surface explains why the build never produced an app.
Expand Down Expand Up @@ -607,7 +621,15 @@ func (w *DeployStatusReconciler) computeNewStatus(ctx context.Context, providerI
return deployStatusBuilding, nil
}

return deploymentStatusFromK8s(deploy), nil
status := deploymentStatusFromK8s(deploy)
if status == deployStatusFailed && deploymentProgressDeadlineExceeded(deploy) {
// Runtime rollout-failure detection (broken-image silent-failure fix,
// 2026-06-08). Attribute ONLY the progress-deadline path —
// DeploymentReplicaFailure also maps to failed but is a distinct cause
// (the ReplicaSet could not create pods) and is not this counter's scope.
metrics.DeployRuntimeFailedDetectedTotal.WithLabelValues(runtimeFailReasonProgressDeadline).Inc()
}
return status, nil
}

// jobIsFailed reports whether a kaniko build Job has reached a terminal
Expand Down Expand Up @@ -674,12 +696,40 @@ func deploymentStatusFromK8s(deploy *appsv1.Deployment) string {
if deploy.Status.AvailableReplicas >= 1 {
return deployStatusHealthy
}
// Rollout exceeded its progress deadline with NO available replica: the
// pods were created but their containers cannot start — the modal cause is
// a broken built image (CreateContainerError "no command specified",
// ImagePullBackOff, or CrashLoopBackOff). k8s takes no action on a stalled
// rollout beyond reporting this condition, so the reconciler treats it as
// terminal. Without this branch such a deploy reports
// "deploying" forever (UnavailableReplicas>0 below) and never transitions to
// failed — so the failure-autopsy (gated on newStatus==failed) never fires
// and the user gets no failure email. This is the runtime twin of the
// build-Job-failed override (jobIsFailed). Checked AFTER the healthy branch
// so a partially-failed redeploy whose previous ReplicaSet still serves is
// reported healthy, not failed. Kept in sync with the api's deploymentStatus.
if deploymentProgressDeadlineExceeded(deploy) {
return deployStatusFailed
}
if deploy.Status.UpdatedReplicas > 0 || deploy.Status.UnavailableReplicas > 0 {
return deployStatusDeploying
}
return deployStatusBuilding
}

// deploymentProgressDeadlineExceeded scans the Deployment's status conditions
// and returns true when one of them is a Progressing condition whose status is
// False and whose reason equals progressDeadlineExceededReason. It returns
// false when no such condition is present, including when the condition list
// is empty.
func deploymentProgressDeadlineExceeded(deploy *appsv1.Deployment) bool {
	conditions := deploy.Status.Conditions
	for i := range conditions {
		c := &conditions[i]
		// Only the Progressing condition carries the deadline verdict.
		if c.Type != appsv1.DeploymentProgressing {
			continue
		}
		if c.Status == corev1.ConditionFalse && c.Reason == progressDeadlineExceededReason {
			return true
		}
	}
	return false
}

// deployNamespaceFromProviderID derives the per-deployment namespace from the
// provider_id stored on a deployments row. provider_id = "app-<appID>";
// namespace = "instant-deploy-<appID>". Returns "" for rows whose provider_id
Expand Down
28 changes: 28 additions & 0 deletions internal/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@
// outcome="wake_failed" — a wake/scale-up attempt failed (k8s error).
// P1 if > 0: a user's app may be stuck asleep.
// outcome="scale_failed" — a scale-DOWN k8s patch failed (the row is left
// untouched and retried next tick). P2 observ.

Check warning on line 139 in internal/metrics/metrics.go

View workflow job for this annotation

GitHub Actions / typos

"observ" should be "observe".
//
// NR alert: deploy-scale-to-zero-fail.json (wake_failed > 0 → P1;
// scale_failed sustained → P2). Prom rule: DeployScaleToZeroFailures.
Expand Down Expand Up @@ -665,6 +665,34 @@
Help: "Kaniko build Jobs detected in Failed state by deploy_status_reconcile (silent-deploy-failure fix, 2026-05-30). Labelled by Job Failed-condition reason.",
}, []string{"reason"})

// ── deploy_status_reconcile — runtime rollout-failure detector (2026-06-08) ──
//
// The runtime-side twin of the build-side DeployJobFailedDetectedTotal.
// Increments when the reconciler flips a deployment to "failed" because the
// runtime k8s Deployment exceeded its progress deadline with NO available
// replica (Progressing=False, reason=ProgressDeadlineExceeded). This catches
// the silent RUNTIME failure class the build-Job detector misses: the build
// SUCCEEDED but the produced image cannot start (CreateContainerError "no
// command specified", ImagePullBackOff, CrashLoopBackOff). Pre-fix these
// deploys reported "deploying" forever and never autopsied or emailed.
//
// Label `reason`: bounded — currently only "progress_deadline_exceeded".
//
// NR alert (infra/newrelic/alerts/deploy-runtime-failed.json):
// sum(rate(instant_deploy_runtime_failed_detected_total[15m])) > 0
// for 15m → P1 page (user-visible recoverable: deploys are failing to
// start at runtime — likely a broken base image, a registry/pull-secret
// regression, or a platform image-build defect producing empty images).
//
// Catalog row (infra/observability/METRICS-CATALOG.md):
// instant_deploy_runtime_failed_detected_total | counter | reason | lazy
// (first observation is a real runtime-failure detection; the label is
// primed in metrics_test.go so /metrics exposes it from process start).
// NOTE(review): metrics_test.go is only compiled into the test binary, so
// priming there does not by itself expose the series in a production
// process — confirm whether a non-test priming site exists.
DeployRuntimeFailedDetectedTotal = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "instant_deploy_runtime_failed_detected_total",
Help: "Runtime Deployments detected as failed-to-progress by deploy_status_reconcile (ProgressDeadlineExceeded with no available replica — broken-image silent-failure fix, 2026-06-08). Labelled by detection reason.",
}, []string{"reason"})

// ── deploy_failure_autopsy — capture outcome counter (PR 2, 2026-05-30) ──
//
// Increments once per captureDeploymentAutopsy call, labelled by outcome.
Expand Down
4 changes: 4 additions & 0 deletions internal/metrics/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,10 @@ func TestAllMetrics_AreRegistered(t *testing.T) {
E2ECohortSweptTotal.WithLabelValues("failed").Add(0)
E2ECohortSweptTotal.WithLabelValues("skipped_not_cohort").Add(0)
DeployJobFailedDetectedTotal.WithLabelValues("BackoffLimitExceeded").Add(0)
// Prime the runtime-failure detector label so /metrics exposes it from
// process start (lazy *Vec; first real observation is a ProgressDeadlineExceeded
// detection in deploy_status_reconcile).
DeployRuntimeFailedDetectedTotal.WithLabelValues("progress_deadline_exceeded").Add(0)
// Prime all four DeployAutopsyCapturedTotal outcome label values so
// /metrics exposes them from process start (lazy emit otherwise leaves
// the panel empty until the first real autopsy fires).
Expand Down
Loading