From b8edeb5f828b4614716252ac9087d7ef11f37776 Mon Sep 17 00:00:00 2001 From: Manas Srivastava Date: Mon, 8 Jun 2026 21:03:26 +0530 Subject: [PATCH] fix(deploy): mark broken-image deploys failed on ProgressDeadlineExceeded MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit deploymentStatus mapped a rollout that exceeded its progress deadline with no available replica (pods created but containers can't start: CreateContainerError "no command specified", ImagePullBackOff, CrashLoopBackOff) to "deploying" forever — it only checked DeploymentReplicaFailure + replica counts. Add a Progressing=False/ProgressDeadlineExceeded -> failed branch (after the healthy check, so a partially-failed redeploy whose old ReplicaSet still serves stays healthy). Mirrors the worker's deploy_status_reconcile fix. - new FailureReason StartFailed (+ hint) for the create/run-container-error class; added to the exhaustive knownReasons hint test. Co-Authored-By: Claude Opus 4.8 (1M context) --- internal/models/deployment_event.go | 9 ++++ internal/models/deployment_failure_hints.go | 5 ++ internal/models/deployment_failure_test.go | 1 + internal/providers/compute/k8s/client.go | 43 ++++++++++++++++- .../providers/compute/k8s/coverage_test.go | 47 +++++++++++++++++++ 5 files changed, 104 insertions(+), 1 deletion(-) diff --git a/internal/models/deployment_event.go b/internal/models/deployment_event.go index aa96cd2f..fbd8f1f9 100644 --- a/internal/models/deployment_event.go +++ b/internal/models/deployment_event.go @@ -61,6 +61,15 @@ const ( // (10-minute deadline in runDeploy / waitForJobComplete). FailureReasonDeadlineExceeded = "DeadlineExceeded" + // FailureReasonStartFailed means k8s created the app's pod but the + // container could not start — the runtime "CreateContainerError", + // "CreateContainerConfigError", or "RunContainerError" waiting reasons. 
+ // The modal cause is a built image with no CMD/ENTRYPOINT ("no command + // specified") or an invalid container configuration. Distinct from + // ImagePullBackOff (image unreachable) and CrashLoopBackOff (image runs + // then exits non-zero): here the container is never successfully created. + FailureReasonStartFailed = "StartFailed" + // FailureReasonError covers transient k8s API errors and generic // "ReplicaFailure" conditions that don't map to a more specific reason. FailureReasonError = "Error" diff --git a/internal/models/deployment_failure_hints.go b/internal/models/deployment_failure_hints.go index 3c63d01e..30fe968c 100644 --- a/internal/models/deployment_failure_hints.go +++ b/internal/models/deployment_failure_hints.go @@ -39,6 +39,11 @@ var FailureHint = map[string]string{ "Large base images or slow package installs can cause this. " + "Try a smaller base image (e.g. alpine) and pre-install dependencies in the Dockerfile.", + FailureReasonStartFailed: "Kubernetes created your app's pod but the container could not start. " + + "The most common cause is a built image with no CMD/ENTRYPOINT (nothing to run) " + + "or an invalid container configuration. Make sure your Dockerfile ends with a " + + "CMD or ENTRYPOINT instruction, then re-deploy.", + FailureReasonError: "A Kubernetes replica failure was detected. " + "This is often a transient scheduling or resource constraint. 
" + "Re-deploy to retry; if it persists, check your Dockerfile for correct CMD/ENTRYPOINT.", diff --git a/internal/models/deployment_failure_test.go b/internal/models/deployment_failure_test.go index 79b411a0..f0e04999 100644 --- a/internal/models/deployment_failure_test.go +++ b/internal/models/deployment_failure_test.go @@ -21,6 +21,7 @@ var knownReasons = []string{ FailureReasonCrashLoopBackOff, FailureReasonBuildFailed, FailureReasonDeadlineExceeded, + FailureReasonStartFailed, FailureReasonError, FailureReasonUnknown, } diff --git a/internal/providers/compute/k8s/client.go b/internal/providers/compute/k8s/client.go index 4a312a78..1ed9b65d 100644 --- a/internal/providers/compute/k8s/client.go +++ b/internal/providers/compute/k8s/client.go @@ -2451,19 +2451,46 @@ func deployIngressURL(appID string) string { return scheme + "://" + appID + "." + domain } +// progressDeadlineExceededReason is the Reason k8s stamps on a Deployment's +// Progressing condition (status=False) when a rollout fails to make progress +// within spec.progressDeadlineSeconds (default 600s). k8s does not export it as +// a typed constant (it lives in the deployment controller as +// deploymentutil.TimedOutReason), so we name it here per the no-hardcoded- +// strings rule. `kubectl rollout status` treats this exact reason as a failed +// rollout. +const progressDeadlineExceededReason = "ProgressDeadlineExceeded" + // deploymentStatus translates k8s Deployment conditions and replica counts into // one of: building|deploying|healthy|failed|stopped. func deploymentStatus(deploy *appsv1.Deployment) string { - // Check for failure conditions first. + // Replica-creation failure first (the ReplicaSet could not create pods: + // quota exhausted, forbidden, etc.) — terminal. for _, cond := range deploy.Status.Conditions { if cond.Type == appsv1.DeploymentReplicaFailure && cond.Status == corev1.ConditionTrue { return "failed" } } + // At least one replica serving → healthy. 
Checked BEFORE the progress- + // deadline failure below so a partially-failed *redeploy* whose previous + // ReplicaSet still serves is reported healthy, not failed. if deploy.Status.AvailableReplicas >= 1 { return "healthy" } + + // Rollout exceeded its progress deadline with NO available replica: the + // pods were created but their containers cannot start — the modal cause is + // a broken built image (CreateContainerError "no command specified", + // ImagePullBackOff, or CrashLoopBackOff). k8s takes no action past the + // deadline, so we treat it as terminal. Without this branch such a deploy reports + // "deploying" forever (UnavailableReplicas>0 below) and never reaches a + // terminal state — the silent runtime-deploy-failure class (twin of the + // build-Job-failed fix). Kept in sync with the worker's + // deploy_status_reconcile.deploymentStatusFromK8s. + if deploymentProgressDeadlineExceeded(deploy) { + return "failed" + } + if deploy.Status.UpdatedReplicas > 0 || deploy.Status.UnavailableReplicas > 0 { return "deploying" } @@ -2471,6 +2498,20 @@ func deploymentStatus(deploy *appsv1.Deployment) string { return "building" } +// deploymentProgressDeadlineExceeded reports whether the Deployment's +// Progressing condition is False with reason ProgressDeadlineExceeded — k8s's +// signal that the rollout stalled past its progress deadline. +func deploymentProgressDeadlineExceeded(deploy *appsv1.Deployment) bool { + for _, cond := range deploy.Status.Conditions { + if cond.Type == appsv1.DeploymentProgressing && + cond.Status == corev1.ConditionFalse && + cond.Reason == progressDeadlineExceededReason { + return true + } + } + return false +} + // maxExtractedTarBytes caps the total uncompressed size extractTarGz will // write. A crafted gzip bomb compresses to a few KB but expands to gigabytes; // without a ceiling that fills the extraction volume. 
512 MiB is comfortably diff --git a/internal/providers/compute/k8s/coverage_test.go b/internal/providers/compute/k8s/coverage_test.go index f63532af..6d35f10f 100644 --- a/internal/providers/compute/k8s/coverage_test.go +++ b/internal/providers/compute/k8s/coverage_test.go @@ -189,6 +189,53 @@ func TestDeploymentStatus(t *testing.T) { if got := deploymentStatus(failed); got != "failed" { t.Errorf("failed = %q", got) } + + // ProgressDeadlineExceeded with NO available replica → failed. This is the + // silent runtime-deploy-failure case: pods created but the container can't + // start (broken image / no CMD), Progressing=False, UnavailableReplicas>0. + // Pre-fix this mapped to "deploying" forever. + progressTimeout := &appsv1.Deployment{Status: appsv1.DeploymentStatus{ + UnavailableReplicas: 1, + Conditions: []appsv1.DeploymentCondition{ + { + Type: appsv1.DeploymentProgressing, + Status: corev1.ConditionFalse, + Reason: progressDeadlineExceededReason, + }, + }, + }} + if got := deploymentStatus(progressTimeout); got != "failed" { + t.Errorf("progress-deadline-exceeded = %q, want failed", got) + } + + // A serving deployment (AvailableReplicas>=1) whose newest rollout timed + // out (e.g. a failed redeploy that left the previous ReplicaSet serving) + // stays healthy — the available-replica check precedes the deadline check. + healthyDespiteTimeout := &appsv1.Deployment{Status: appsv1.DeploymentStatus{ + AvailableReplicas: 1, + Conditions: []appsv1.DeploymentCondition{ + { + Type: appsv1.DeploymentProgressing, + Status: corev1.ConditionFalse, + Reason: progressDeadlineExceededReason, + }, + }, + }} + if got := deploymentStatus(healthyDespiteTimeout); got != "healthy" { + t.Errorf("healthy-despite-timeout = %q, want healthy", got) + } + + // Progressing=True (rollout still within its deadline) must NOT be read as + // a deadline failure — it stays deploying. 
+ progressingOK := &appsv1.Deployment{Status: appsv1.DeploymentStatus{ + UnavailableReplicas: 1, + Conditions: []appsv1.DeploymentCondition{ + {Type: appsv1.DeploymentProgressing, Status: corev1.ConditionTrue, Reason: "ReplicaSetUpdated"}, + }, + }} + if got := deploymentStatus(progressingOK); got != "deploying" { + t.Errorf("progressing-ok = %q, want deploying", got) + } } // ── Tarball extraction ───────────────────────────────────────────────────────