From b8edeb5f828b4614716252ac9087d7ef11f37776 Mon Sep 17 00:00:00 2001
From: Manas Srivastava <mastermanas805@gmail.com>
Date: Mon, 8 Jun 2026 21:03:26 +0530
Subject: [PATCH] fix(deploy): mark broken-image deploys failed on
 ProgressDeadlineExceeded
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

deploymentStatus reported "deploying" forever for a rollout that exceeded its progress deadline with no available replica, i.e. pods were created but their containers could not start (CreateContainerError "no command specified", ImagePullBackOff, CrashLoopBackOff). It only checked the DeploymentReplicaFailure condition and the replica counts.

Add a Progressing=False/ProgressDeadlineExceeded -> failed branch. It runs after the healthy check, so a partially-failed redeploy whose old ReplicaSet still serves stays healthy. This mirrors the worker's deploy_status_reconcile fix.

- new FailureReason StartFailed (+ hint) for the create/run-container-error class; added to the exhaustive knownReasons hint test.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 internal/models/deployment_event.go           |  9 ++++
 internal/models/deployment_failure_hints.go   |  5 ++
 internal/models/deployment_failure_test.go    |  1 +
 internal/providers/compute/k8s/client.go      | 43 ++++++++++++++++-
 .../providers/compute/k8s/coverage_test.go    | 47 +++++++++++++++++++
 5 files changed, 104 insertions(+), 1 deletion(-)

diff --git a/internal/models/deployment_event.go b/internal/models/deployment_event.go
index aa96cd2f..fbd8f1f9 100644
--- a/internal/models/deployment_event.go
+++ b/internal/models/deployment_event.go
@@ -61,6 +61,15 @@ const (
 	// (10-minute deadline in runDeploy / waitForJobComplete).
 	FailureReasonDeadlineExceeded = "DeadlineExceeded"
 
+	// FailureReasonStartFailed means k8s created the app's pod but the
+	// container could not start — the runtime "CreateContainerError",
+	// "CreateContainerConfigError", or "RunContainerError" waiting reasons.
+	// The modal cause is a built image with no CMD/ENTRYPOINT ("no command
+	// specified") or an invalid container configuration. Distinct from
+	// ImagePullBackOff (image unreachable) and CrashLoopBackOff (image runs
+	// then exits non-zero): here the container is never successfully created.
+	FailureReasonStartFailed = "StartFailed"
+
 	// FailureReasonError covers transient k8s API errors and generic
 	// "ReplicaFailure" conditions that don't map to a more specific reason.
 	FailureReasonError = "Error"
diff --git a/internal/models/deployment_failure_hints.go b/internal/models/deployment_failure_hints.go
index 3c63d01e..30fe968c 100644
--- a/internal/models/deployment_failure_hints.go
+++ b/internal/models/deployment_failure_hints.go
@@ -39,6 +39,11 @@ var FailureHint = map[string]string{
 		"Large base images or slow package installs can cause this. " +
 		"Try a smaller base image (e.g. alpine) and pre-install dependencies in the Dockerfile.",
 
+	FailureReasonStartFailed: "Kubernetes created your app's pod but the container could not start. " +
+		"The most common cause is a built image with no CMD/ENTRYPOINT (nothing to run) " +
+		"or an invalid container configuration. Make sure your Dockerfile ends with a " +
+		"CMD or ENTRYPOINT instruction, then re-deploy.",
+
 	FailureReasonError: "A Kubernetes replica failure was detected. " +
 		"This is often a transient scheduling or resource constraint. " +
 		"Re-deploy to retry; if it persists, check your Dockerfile for correct CMD/ENTRYPOINT.",
diff --git a/internal/models/deployment_failure_test.go b/internal/models/deployment_failure_test.go
index 79b411a0..f0e04999 100644
--- a/internal/models/deployment_failure_test.go
+++ b/internal/models/deployment_failure_test.go
@@ -21,6 +21,7 @@ var knownReasons = []string{
 	FailureReasonCrashLoopBackOff,
 	FailureReasonBuildFailed,
 	FailureReasonDeadlineExceeded,
+	FailureReasonStartFailed,
 	FailureReasonError,
 	FailureReasonUnknown,
 }
diff --git a/internal/providers/compute/k8s/client.go b/internal/providers/compute/k8s/client.go
index 4a312a78..1ed9b65d 100644
--- a/internal/providers/compute/k8s/client.go
+++ b/internal/providers/compute/k8s/client.go
@@ -2451,19 +2451,46 @@ func deployIngressURL(appID string) string {
 	return scheme + "://" + appID + "." + domain
 }
 
+// progressDeadlineExceededReason is the Reason k8s stamps on a Deployment's
+// Progressing condition (status=False) when a rollout fails to make progress
+// within spec.progressDeadlineSeconds (default 600s). k8s does not export it as
+// a typed constant (it lives in the deployment controller as
+// deploymentutil.TimedOutReason), so we name it here per the no-hardcoded-
+// strings rule. `kubectl rollout status` treats this exact reason as a failed
+// rollout.
+const progressDeadlineExceededReason = "ProgressDeadlineExceeded"
+
 // deploymentStatus translates k8s Deployment conditions and replica counts into
 // one of: building|deploying|healthy|failed|stopped.
 func deploymentStatus(deploy *appsv1.Deployment) string {
-	// Check for failure conditions first.
+	// Replica-creation failure first (the ReplicaSet could not create pods:
+	// quota exhausted, forbidden, etc.) — terminal.
 	for _, cond := range deploy.Status.Conditions {
 		if cond.Type == appsv1.DeploymentReplicaFailure && cond.Status == corev1.ConditionTrue {
 			return "failed"
 		}
 	}
 
+	// At least one replica serving → healthy. Checked BEFORE the progress-
+	// deadline failure below so a partially-failed *redeploy* whose previous
+	// ReplicaSet still serves is reported healthy, not failed.
 	if deploy.Status.AvailableReplicas >= 1 {
 		return "healthy"
 	}
+
+	// Rollout exceeded its progress deadline with NO available replica: the
+	// pods were created but their containers cannot start — the modal cause is
+	// a broken built image (CreateContainerError "no command specified",
+	// ImagePullBackOff, or CrashLoopBackOff). k8s only reports the condition
+	// and takes no further action, so we treat the stalled rollout as failed
+	// (as `kubectl rollout status` does). Without this branch such a deploy
+	// reports "deploying" forever (UnavailableReplicas>0 below) — the silent
+	// runtime-deploy-failure class (twin of the build-Job-failed fix). Kept in
+	// sync with the worker's deploy_status_reconcile.deploymentStatusFromK8s.
+	if deploymentProgressDeadlineExceeded(deploy) {
+		return "failed"
+	}
+
 	if deploy.Status.UpdatedReplicas > 0 || deploy.Status.UnavailableReplicas > 0 {
 		return "deploying"
 	}
@@ -2471,6 +2498,20 @@ func deploymentStatus(deploy *appsv1.Deployment) string {
 	return "building"
 }
 
+// deploymentProgressDeadlineExceeded reports whether the Deployment's
+// Progressing condition is False with reason ProgressDeadlineExceeded — k8s's
+// signal that the rollout made no progress within its progress deadline.
+func deploymentProgressDeadlineExceeded(deploy *appsv1.Deployment) bool {
+	for _, cond := range deploy.Status.Conditions {
+		if cond.Type == appsv1.DeploymentProgressing &&
+			cond.Status == corev1.ConditionFalse &&
+			cond.Reason == progressDeadlineExceededReason {
+			return true
+		}
+	}
+	return false
+}
+
 // maxExtractedTarBytes caps the total uncompressed size extractTarGz will
 // write. A crafted gzip bomb compresses to a few KB but expands to gigabytes;
 // without a ceiling that fills the extraction volume. 512 MiB is comfortably
diff --git a/internal/providers/compute/k8s/coverage_test.go b/internal/providers/compute/k8s/coverage_test.go
index f63532af..6d35f10f 100644
--- a/internal/providers/compute/k8s/coverage_test.go
+++ b/internal/providers/compute/k8s/coverage_test.go
@@ -189,6 +189,53 @@ func TestDeploymentStatus(t *testing.T) {
 	if got := deploymentStatus(failed); got != "failed" {
 		t.Errorf("failed = %q", got)
 	}
+
+	// ProgressDeadlineExceeded with NO available replica → failed. This is the
+	// silent runtime-deploy-failure case: pods created but the container can't
+	// start (broken image / no CMD), Progressing=False, UnavailableReplicas>0.
+	// Pre-fix this mapped to "deploying" forever.
+	progressTimeout := &appsv1.Deployment{Status: appsv1.DeploymentStatus{
+		UnavailableReplicas: 1,
+		Conditions: []appsv1.DeploymentCondition{
+			{
+				Type:   appsv1.DeploymentProgressing,
+				Status: corev1.ConditionFalse,
+				Reason: progressDeadlineExceededReason,
+			},
+		},
+	}}
+	if got := deploymentStatus(progressTimeout); got != "failed" {
+		t.Errorf("progress-deadline-exceeded = %q, want failed", got)
+	}
+
+	// A serving deployment (AvailableReplicas>=1) whose newest rollout timed
+	// out (e.g. a failed redeploy that left the previous ReplicaSet serving)
+	// stays healthy — the available-replica check precedes the deadline check.
+	healthyDespiteTimeout := &appsv1.Deployment{Status: appsv1.DeploymentStatus{
+		AvailableReplicas: 1,
+		Conditions: []appsv1.DeploymentCondition{
+			{
+				Type:   appsv1.DeploymentProgressing,
+				Status: corev1.ConditionFalse,
+				Reason: progressDeadlineExceededReason,
+			},
+		},
+	}}
+	if got := deploymentStatus(healthyDespiteTimeout); got != "healthy" {
+		t.Errorf("healthy-despite-timeout = %q, want healthy", got)
+	}
+
+	// Progressing=True (rollout still within its deadline) must NOT be read as
+	// a deadline failure — it stays deploying.
+	progressingOK := &appsv1.Deployment{Status: appsv1.DeploymentStatus{
+		UnavailableReplicas: 1,
+		Conditions: []appsv1.DeploymentCondition{
+			{Type: appsv1.DeploymentProgressing, Status: corev1.ConditionTrue, Reason: "ReplicaSetUpdated"},
+		},
+	}}
+	if got := deploymentStatus(progressingOK); got != "deploying" {
+		t.Errorf("progressing-ok = %q, want deploying", got)
+	}
 }
 
 // ── Tarball extraction ───────────────────────────────────────────────────────