diff --git a/internal/config/config.go b/internal/config/config.go index 8e4427f..534b87c 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -5,6 +5,7 @@ import ( "fmt" "log/slog" "os" + "strconv" "strings" ) @@ -178,6 +179,15 @@ type Config struct { FlowSyntheticTier string // FLOW_SYNTHETIC_TIER — seeded tier (default free) FlowSyntheticDisabled string // FLOW_SYNTHETIC_DISABLED — comma list of per-flow kill switches FlowSyntheticJWTSecret string // JWT_SECRET — shared with api; mints the synthetic session JWT + + // Scale-to-zero idle-scaler (deploy_idle_scaler.go, Task #54). INERT unless + // DeployScaleToZeroEnabled is true — the master flag (shared name with the + // api's wake-path flag). When off, the idle-scaler sweep is a no-op (no k8s + // patch, no DB write). DeployScaleToZeroIdleMinutes is the no-activity + // threshold before an app is descheduled (default 30; floored at 5 to avoid + // pathological flapping). Enabling is an operator action after a canary. + DeployScaleToZeroEnabled bool // DEPLOY_SCALE_TO_ZERO_ENABLED — master flag (default false) + DeployScaleToZeroIdleMinutes int // DEPLOY_SCALE_TO_ZERO_IDLE_MINUTES — idle threshold (default 30) } // ErrMissingConfig is returned when a required env var is absent. @@ -294,6 +304,20 @@ func Load() *Config { FlowSyntheticTier: os.Getenv("FLOW_SYNTHETIC_TIER"), FlowSyntheticDisabled: os.Getenv("FLOW_SYNTHETIC_DISABLED"), FlowSyntheticJWTSecret: os.Getenv("JWT_SECRET"), + + // Scale-to-zero idle-scaler (Task #54). Default OFF; idle threshold + // default 30 min (parsed below). + DeployScaleToZeroEnabled: os.Getenv("DEPLOY_SCALE_TO_ZERO_ENABLED") == "true", + } + + // DEPLOY_SCALE_TO_ZERO_IDLE_MINUTES: minutes of no-activity before an app is + // descheduled. Default 30; an unset / unparseable / sub-5 value floors to 30 + // so a misconfig can't make the scaler aggressively flap apps to sleep. 
+ cfg.DeployScaleToZeroIdleMinutes = 30 + if v := strings.TrimSpace(os.Getenv("DEPLOY_SCALE_TO_ZERO_IDLE_MINUTES")); v != "" { + if n, err := strconv.Atoi(v); err == nil && n >= 5 { + cfg.DeployScaleToZeroIdleMinutes = n + } } // Fall back to the shared object-store bucket when the operator hasn't diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 947d852..99e461e 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -126,6 +126,36 @@ func TestLoad_Defaults(t *testing.T) { } } +// TestLoad_DeployScaleToZeroIdleMinutes exercises the env-parse branch for +// DEPLOY_SCALE_TO_ZERO_IDLE_MINUTES: a valid value is honoured; an invalid / +// sub-5 value floors to the 30-minute default. +func TestLoad_DeployScaleToZeroIdleMinutes(t *testing.T) { + t.Run("valid override", func(t *testing.T) { + clearEnv(t) + t.Setenv("DATABASE_URL", "postgres://localhost/db") + t.Setenv("DEPLOY_SCALE_TO_ZERO_IDLE_MINUTES", "45") + if got := Load().DeployScaleToZeroIdleMinutes; got != 45 { + t.Errorf("DeployScaleToZeroIdleMinutes = %d; want 45", got) + } + }) + t.Run("sub-5 floors to default", func(t *testing.T) { + clearEnv(t) + t.Setenv("DATABASE_URL", "postgres://localhost/db") + t.Setenv("DEPLOY_SCALE_TO_ZERO_IDLE_MINUTES", "3") + if got := Load().DeployScaleToZeroIdleMinutes; got != 30 { + t.Errorf("sub-5 DeployScaleToZeroIdleMinutes = %d; want floor 30", got) + } + }) + t.Run("non-numeric floors to default", func(t *testing.T) { + clearEnv(t) + t.Setenv("DATABASE_URL", "postgres://localhost/db") + t.Setenv("DEPLOY_SCALE_TO_ZERO_IDLE_MINUTES", "abc") + if got := Load().DeployScaleToZeroIdleMinutes; got != 30 { + t.Errorf("non-numeric DeployScaleToZeroIdleMinutes = %d; want floor 30", got) + } + }) +} + func TestLoad_PanicsWithoutDatabaseURL(t *testing.T) { clearEnv(t) defer func() { diff --git a/internal/jobs/deploy_idle_scaler.go b/internal/jobs/deploy_idle_scaler.go new file mode 100644 index 0000000..91e5659 --- /dev/null 
+++ b/internal/jobs/deploy_idle_scaler.go @@ -0,0 +1,372 @@ +package jobs + +// deploy_idle_scaler.go — scale-to-zero idle descheduler (Task #54). +// +// SIBLING TO deployment_expirer.go, NOT A REPLACEMENT +// +// - deployment_expirer soft-deletes (status='expired') a deploy whose TTL +// elapsed. That is PERMANENT and tears the app down. +// - this idle-scaler patches an idle (but live) app's Deployment to +// replicas=0 — ~$0 compute, fully REVERSIBLE. The row stays 'healthy'; +// only scaled_to_zero flips true. The api wake endpoint (or a redeploy) +// brings it back. Idle ≠ expired. +// +// THE IDLE SIGNAL (v1 — stated honestly) +// +// instanode.dev serves a deployed app via a k8s Ingress that routes straight +// to the per-app Service; the api/worker processes are NOT in the request +// path, and no nginx-ingress request-total scrape is wired into the worker +// today. So the only reliable "activity" signal v1 has is the +// deployments.last_activity_at column, which is stamped at create-time and +// bumped on every deploy / redeploy / explicit wake (api migration 068). +// +// THEREFORE v1 idle = "no deploy / redeploy / wake for N minutes", NOT +// "no HTTP traffic for N minutes". This is a deliberately conservative signal: +// it will never deschedule an app the user is actively redeploying, and the +// explicit-wake path makes a wrongly-slept app one POST away from awake. The +// FOLLOW-UP to make this traffic-based is to scrape an nginx-ingress per-host +// request counter (or have the ingress bump last_activity_at) — that lifts the +// signal to true traffic-idle without changing this job's structure. +// +// FLAG-GATED, DEFAULT OFF +// +// The whole job is inert unless DEPLOY_SCALE_TO_ZERO_ENABLED is set. When off, +// Work() logs at DEBUG and returns immediately — no k8s patch, no DB write. +// Proven by TestDeployIdleScaler_FlagOffNoOp. 
+// +// FAIL-OPEN POSTURE +// +// Constructed with a deployScaleK8sProvider that may be nil (no cluster in CI / +// docker-compose). Work() short-circuits with a WARN when k8s is nil — the +// rest of the worker keeps running, identical to deploy_status_reconcile. +// +// PER-APP OPT-OUT +// +// always_on=true pins an app: the candidate SELECT excludes it, so a pinned +// (Pro+/operator) app never sleeps. The scale-down UPDATE is double-guarded on +// the same predicate so a concurrent pin/redeploy/wake between SELECT and +// UPDATE makes the row report 0 rows (skipped, not wrongly slept). + +import ( + "context" + "database/sql" + "fmt" + "log/slog" + "strings" + "time" + + "github.com/google/uuid" + "github.com/riverqueue/river" + appsv1 "k8s.io/api/apps/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + + "instant.dev/worker/internal/metrics" +) + +const ( + // deployIdleScalerInterval is how often the sweep runs. 2 min is frequent + // enough that an app sleeps promptly after crossing the idle threshold, + // while the threshold itself (default 30 min) is what actually governs + // when descheduling happens — the tick just polls. + deployIdleScalerInterval = 2 * time.Minute + + // idleScalerBatchLimit caps how many apps one tick deschedules so a backlog + // (flag flipped on for the first time with a large idle fleet) spreads the + // k8s API load across ticks instead of one thundering burst. + idleScalerBatchLimit = 50 + + // idleScalerK8sTimeout caps a single ScaleDeployment call so one stuck + // namespace can't stall the batch. + idleScalerK8sTimeout = 5 * time.Second + + // Status / naming constants — verbatim copies of the api's canonical set + // (the worker module does not import the api module; same convention as + // deploy_status_reconcile.go). If the api strings change, update both.
+ idleScalerStatusHealthy = "healthy" + idleScalerProviderPfx = "app-" // provider_id = "app-<appID>" + idleScalerNSPfx = "instant-deploy-" // namespace = "instant-deploy-<appID>" +) + +// DeployIdleScalerArgs is the periodic-job payload. Empty — every run is a +// full candidate sweep. +type DeployIdleScalerArgs struct{} + +// Kind implements river.JobArgs. +func (DeployIdleScalerArgs) Kind() string { return "deploy_idle_scaler" } + +// deployScaleK8sProvider is the slice of k8s the idle-scaler needs: patch a +// Deployment's replica count. Defined as an interface so the worker can pass +// nil when no cluster is reachable and so tests inject a recording fake. +type deployScaleK8sProvider interface { + // ScaleDeployment patches spec.replicas on the named Deployment. A NotFound + // Deployment MUST be returned as apierrors.IsNotFound so the caller can + // treat a torn-down app as "skip, not fail" instead of wedging the row. + ScaleDeployment(ctx context.Context, namespace, name string, replicas int32) error +} + +// k8sDeployScaleClient is the concrete deployScaleK8sProvider, backed by a +// kubernetes.Clientset. Mirrors k8sDeployStatusClient in +// deploy_status_reconcile.go. +type k8sDeployScaleClient struct { + cs kubernetes.Interface +} + +// ScaleDeployment implements deployScaleK8sProvider via a read-modify-write +// Update (the fake clientset used in tests does not support strategic-merge +// Patch on subresources, and Update is the same idempotent shape the api's +// compute.Scale uses).
+func (c *k8sDeployScaleClient) ScaleDeployment(ctx context.Context, namespace, name string, replicas int32) error { + d, err := c.cs.AppsV1().Deployments(namespace).Get(ctx, name, metav1.GetOptions{}) + if err != nil { + return err // includes apierrors.IsNotFound — caller inspects it + } + if d.Spec.Replicas != nil && *d.Spec.Replicas == replicas { + return nil // already at target — idempotent no-op + } + r := replicas + d.Spec.Replicas = &r + _, err = c.cs.AppsV1().Deployments(namespace).Update(ctx, d, metav1.UpdateOptions{}) + return err +} + +// NewK8sDeployScaleClient builds a deployScaleK8sProvider sharing the supplied +// clientset (callers pass the same kubernetes.Interface used for the status +// reconciler so they share a connection pool). +func NewK8sDeployScaleClient(cs kubernetes.Interface) deployScaleK8sProvider { + return &k8sDeployScaleClient{cs: cs} +} + +// NewK8sDeployScaleClientFromCluster builds a scale client from in-cluster +// config (kubeconfig fallback for local dev). Returns (nil, err) when no +// cluster is reachable — StartWorkers logs and passes nil, so the idle-scaler +// short-circuits with a WARN each tick (fail-open, identical to the status +// reconciler). Reuses newDeployK8sClientset from deploy_status_reconcile.go. +func NewK8sDeployScaleClientFromCluster() (deployScaleK8sProvider, error) { + cs, err := newDeployScaleClientset() + if err != nil { + return nil, err + } + return &k8sDeployScaleClient{cs: cs}, nil +} + +// newDeployScaleClientset is a package-level indirection over +// newDeployK8sClientset so tests can override the clientset builder to exercise +// the success return of NewK8sDeployScaleClientFromCluster without a reachable +// cluster. +var newDeployScaleClientset = newDeployK8sClientset + +// compile-time assertion that the production client satisfies the interface. +var _ deployScaleK8sProvider = (*k8sDeployScaleClient)(nil) + +// compile-time assertion appsv1 is used (Get returns *appsv1.Deployment). 
+var _ = appsv1.Deployment{} + +// DeployIdleScaler is the River worker that descheduals idle deployments. +type DeployIdleScaler struct { + river.WorkerDefaults[DeployIdleScalerArgs] + db *sql.DB + k8s deployScaleK8sProvider // may be nil → Work warn-logs each tick + enabled bool // DEPLOY_SCALE_TO_ZERO_ENABLED + idleMinutes int // descheduling threshold +} + +// NewDeployIdleScaler constructs the worker. Pass nil for k8sProvider when the +// cluster is unreachable. enabled gates the entire job (default-off flag); +// idleMinutes is the no-activity threshold (the constructor floors it at 5). +func NewDeployIdleScaler(db *sql.DB, k8sProvider deployScaleK8sProvider, enabled bool, idleMinutes int) *DeployIdleScaler { + if idleMinutes < 5 { + idleMinutes = 30 + } + return &DeployIdleScaler{ + db: db, + k8s: k8sProvider, + enabled: enabled, + idleMinutes: idleMinutes, + } +} + +// idleCandidate is the projection the scaler reads. +type idleCandidate struct { + id uuid.UUID + providerID string +} + +// Work runs one idle sweep. +func (w *DeployIdleScaler) Work(ctx context.Context, job *river.Job[DeployIdleScalerArgs]) error { + start := time.Now() + + if !w.enabled { + // Flag OFF → fully inert. Idle-tick at DEBUG per worker convention 1. + slog.Debug("jobs.deploy_idle_scaler.disabled", + "note", "DEPLOY_SCALE_TO_ZERO_ENABLED unset; scale-to-zero is off", + "job_id", job.ID) + return nil + } + + if w.k8s == nil { + slog.Warn("jobs.deploy_idle_scaler.skipped_no_k8s_client", + "reason", "k8s client init failed at startup; idle apps will not be descheduled until the worker restarts with a reachable cluster", + "job_id", job.ID) + return nil + } + + candidates, err := w.listIdleCandidates(ctx) + if err != nil { + return fmt.Errorf("deploy_idle_scaler: list candidates: %w", err) + } + + var scaledDown, skipped, failed int + for _, c := range candidates { + ns, name := namespaceAndNameFromProviderID(c.providerID) + if ns == "" { + // provider_id not in app- shape (e.g. 
a stack row) — not ours. + skipped++ + continue + } + + scaleCtx, cancel := context.WithTimeout(ctx, idleScalerK8sTimeout) + scaleErr := w.k8s.ScaleDeployment(scaleCtx, ns, name, 0) + cancel() + if scaleErr != nil { + if apierrors.IsNotFound(scaleErr) { + // Deployment torn down out from under us — skip, don't flip the + // row (status reconciler / orphan sweep will reconcile it). + skipped++ + continue + } + slog.Warn("jobs.deploy_idle_scaler.scale_failed", + "id", c.id, "namespace", ns, "error", scaleErr) + metrics.DeployScaledToZeroTotal.WithLabelValues("scale_failed").Inc() + failed++ + continue + } + + // DB half: CAS-flip scaled_to_zero=true. Double-guarded on the same + // predicate as the SELECT so a row that raced into a non-eligible state + // (woken, pinned, redeployed, expired) between SELECT and UPDATE is left + // alone — k8s was already patched to 0, but a concurrent wake re-scales + // to 1 and the next tick re-evaluates, so we never strand it. + n, dbErr := w.markScaledToZero(ctx, c.id) + if dbErr != nil { + slog.Error("jobs.deploy_idle_scaler.db_flip_failed", + "id", c.id, "error", dbErr) + metrics.DeployScaledToZeroTotal.WithLabelValues("scale_failed").Inc() + failed++ + continue + } + if n == 0 { + skipped++ + continue + } + metrics.DeployScaledToZeroTotal.WithLabelValues("scaled_down").Inc() + slog.Info("jobs.deploy_idle_scaler.scaled_down", + "id", c.id, "namespace", ns, + "idle_threshold_min", w.idleMinutes) + scaledDown++ + } + + // Sample the asleep-fleet gauge regardless of whether we scaled anything + // this tick (so the tile stays accurate even on quiet ticks). + if asleep, gErr := w.countAsleep(ctx); gErr == nil { + metrics.DeployIdleApps.Set(float64(asleep)) + } else { + slog.Warn("jobs.deploy_idle_scaler.gauge_sample_failed", "error", gErr) + } + + if scaledDown == 0 && failed == 0 { + // Idle tick (nothing descheduled, nothing failed) → DEBUG per convention. 
+ slog.Debug("jobs.deploy_idle_scaler.completed", + "candidates", len(candidates), "scaled_down", 0, "skipped", skipped, + "duration_ms", time.Since(start).Milliseconds(), "job_id", job.ID) + return nil + } + slog.Info("jobs.deploy_idle_scaler.completed", + "candidates", len(candidates), "scaled_down", scaledDown, + "skipped", skipped, "failed", failed, + "duration_ms", time.Since(start).Milliseconds(), "job_id", job.ID) + return nil +} + +// listIdleCandidates returns healthy, not-already-zeroed, not-pinned +// deployments whose last_activity_at is older than the idle threshold. NULL +// last_activity_at rows (legacy, pre-068-backfill edge) are NOT selected — +// migration 068 backfills them, but a defensive NULL-excluding predicate means +// a row with no activity stamp is never descheduled blind. +func (w *DeployIdleScaler) listIdleCandidates(ctx context.Context) ([]idleCandidate, error) { + cutoff := time.Now().UTC().Add(-time.Duration(w.idleMinutes) * time.Minute) + rows, err := w.db.QueryContext(ctx, ` + SELECT id, COALESCE(provider_id, '') + FROM deployments + WHERE status = $1 + AND scaled_to_zero = false + AND always_on = false + AND last_activity_at IS NOT NULL + AND last_activity_at < $2 + AND provider_id IS NOT NULL + AND provider_id <> '' + ORDER BY last_activity_at ASC + LIMIT $3 + `, idleScalerStatusHealthy, cutoff, idleScalerBatchLimit) + if err != nil { + return nil, fmt.Errorf("listIdleCandidates: query: %w", err) + } + defer func() { _ = rows.Close() }() + + var out []idleCandidate + for rows.Next() { + var c idleCandidate + if err := rows.Scan(&c.id, &c.providerID); err != nil { + return nil, fmt.Errorf("listIdleCandidates: scan: %w", err) + } + out = append(out, c) + } + return out, rows.Err() +} + +// markScaledToZero flips scaled_to_zero=true with the same eligibility CAS as +// the candidate SELECT. Returns rows affected (0 = raced into non-eligible +// state; skip). 
+func (w *DeployIdleScaler) markScaledToZero(ctx context.Context, id uuid.UUID) (int64, error) { + res, err := w.db.ExecContext(ctx, ` + UPDATE deployments + SET scaled_to_zero = true, updated_at = now() + WHERE id = $1 + AND status = $2 + AND scaled_to_zero = false + AND always_on = false + `, id, idleScalerStatusHealthy) + if err != nil { + return 0, fmt.Errorf("markScaledToZero: %w", err) + } + n, _ := res.RowsAffected() + return n, nil +} + +// countAsleep returns how many deployments are currently scaled_to_zero — the +// value published to the instant_deploy_idle_apps gauge. +func (w *DeployIdleScaler) countAsleep(ctx context.Context) (int, error) { + var n int + err := w.db.QueryRowContext(ctx, ` + SELECT count(*) FROM deployments WHERE scaled_to_zero = true + `).Scan(&n) + if err != nil { + return 0, fmt.Errorf("countAsleep: %w", err) + } + return n, nil +} + +// namespaceAndNameFromProviderID derives the per-deployment namespace + +// Deployment name from provider_id = "app-<appID>". Returns ("","") for a +// provider_id not in that shape (e.g. a stack row) so the caller skips it. +func namespaceAndNameFromProviderID(providerID string) (namespace, name string) { + if !strings.HasPrefix(providerID, idleScalerProviderPfx) { + return "", "" + } + appID := strings.TrimPrefix(providerID, idleScalerProviderPfx) + if appID == "" { + return "", "" + } + return idleScalerNSPfx + appID, providerID +} diff --git a/internal/jobs/deploy_idle_scaler_test.go b/internal/jobs/deploy_idle_scaler_test.go new file mode 100644 index 0000000..7d53d2c --- /dev/null +++ b/internal/jobs/deploy_idle_scaler_test.go @@ -0,0 +1,498 @@ +package jobs + +// deploy_idle_scaler_test.go — coverage for the scale-to-zero idle-scaler +// (Task #54). SQL via sqlmock; k8s via a recording fake scale provider. +// +// Properties pinned: +// - flag OFF → Work is a total no-op: NO SQL query is issued, NO scale call. +// - k8s nil → Work short-circuits with no SQL query (fail-open).
+// - happy path→ idle candidate is scaled to 0 (recorded) + DB CAS-flipped + +// the scaled_down counter + idle-apps gauge update. +// - CAS race → UPDATE returns 0 rows → counted as skipped, NOT scaled_down. +// - NotFound → torn-down Deployment is skipped (no DB flip, no failure). +// - scale err → non-NotFound k8s error increments scale_failed, row untouched. +// - constructor floors a sub-5 idle threshold to 30. +// - namespaceAndNameFromProviderID derives ns/name and rejects bad shapes. + +import ( + "context" + "errors" + "os" + "sync" + "testing" + + sqlmock "github.com/DATA-DOG/go-sqlmock" + "github.com/google/uuid" + "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/riverqueue/river" + "github.com/riverqueue/river/rivertype" + appsv1 "k8s.io/api/apps/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/kubernetes" + clientfake "k8s.io/client-go/kubernetes/fake" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/clientcmd" + + "instant.dev/worker/internal/metrics" +) + +func idleScalerJob() *river.Job[DeployIdleScalerArgs] { + return &river.Job[DeployIdleScalerArgs]{JobRow: &rivertype.JobRow{ID: 7}} +} + +// recordingScaleProvider records every ScaleDeployment call and can be told to +// return a fixed error (e.g. a synthetic NotFound or transport failure). 
+type recordingScaleProvider struct { + mu sync.Mutex + calls []string // "ns/name=replicas" + err error + notFound bool +} + +func (r *recordingScaleProvider) ScaleDeployment(_ context.Context, ns, name string, replicas int32) error { + r.mu.Lock() + defer r.mu.Unlock() + r.calls = append(r.calls, ns+"/"+name) + if r.notFound { + return apierrors.NewNotFound(schema.GroupResource{Resource: "deployments"}, name) + } + return r.err +} + +func (r *recordingScaleProvider) callCount() int { + r.mu.Lock() + defer r.mu.Unlock() + return len(r.calls) +} + +// TestDeployIdleScaler_FlagOffNoOp proves the job is fully inert when the flag +// is off: sqlmock asserts NO query is issued (any query would fail the +// ExpectationsWereMet check since none are registered), and the panicking +// provider would blow up if Work reached the scale layer. +func TestDeployIdleScaler_FlagOffNoOp(t *testing.T) { + db, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer db.Close() + + prov := &recordingScaleProvider{err: errors.New("must not be called when flag off")} + w := NewDeployIdleScaler(db, prov, false /* enabled */, 30) + + if err := w.Work(context.Background(), idleScalerJob()); err != nil { + t.Fatalf("flag-off Work should be nil, got: %v", err) + } + if prov.callCount() != 0 { + t.Errorf("flag-off must not call ScaleDeployment; got %d calls", prov.callCount()) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("flag-off must issue no SQL: %v", err) + } +} + +// TestDeployIdleScaler_NilK8sNoOp proves a nil k8s client short-circuits before +// any SQL is issued (fail-open at startup). 
+func TestDeployIdleScaler_NilK8sNoOp(t *testing.T) { + db, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer db.Close() + + w := NewDeployIdleScaler(db, nil /* k8s */, true /* enabled */, 30) + if err := w.Work(context.Background(), idleScalerJob()); err != nil { + t.Fatalf("nil-k8s Work should be nil, got: %v", err) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("nil-k8s must issue no SQL: %v", err) + } +} + +// TestDeployIdleScaler_ScalesDownIdleApp covers the happy path: one idle +// candidate → scaled to 0 + DB CAS flip + gauge sample. +func TestDeployIdleScaler_ScalesDownIdleApp(t *testing.T) { + db, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer db.Close() + + id := uuid.New() + mock.ExpectQuery(`SELECT id, COALESCE\(provider_id`). + WillReturnRows(sqlmock.NewRows([]string{"id", "provider_id"}). + AddRow(id, "app-abc123")) + mock.ExpectExec(`UPDATE deployments`). + WithArgs(id, "healthy"). + WillReturnResult(sqlmock.NewResult(0, 1)) + mock.ExpectQuery(`SELECT count\(\*\) FROM deployments WHERE scaled_to_zero = true`). 
+ WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(1)) + + prov := &recordingScaleProvider{} + w := NewDeployIdleScaler(db, prov, true, 30) + + before := testutil.ToFloat64(metrics.DeployScaledToZeroTotal.WithLabelValues("scaled_down")) + if err := w.Work(context.Background(), idleScalerJob()); err != nil { + t.Fatalf("Work: %v", err) + } + if prov.callCount() != 1 { + t.Fatalf("expected 1 ScaleDeployment call, got %d", prov.callCount()) + } + if prov.calls[0] != "instant-deploy-abc123/app-abc123" { + t.Errorf("scaled wrong target: %q", prov.calls[0]) + } + after := testutil.ToFloat64(metrics.DeployScaledToZeroTotal.WithLabelValues("scaled_down")) + if after != before+1 { + t.Errorf("scaled_down counter: before=%v after=%v", before, after) + } + if g := testutil.ToFloat64(metrics.DeployIdleApps); g != 1 { + t.Errorf("idle-apps gauge = %v; want 1", g) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet SQL expectations: %v", err) + } +} + +// TestDeployIdleScaler_CASRaceSkips covers the case where the row was already +// woken/pinned/redeployed between SELECT and UPDATE — UPDATE returns 0 rows, +// so the scaled_down counter must NOT increment. +func TestDeployIdleScaler_CASRaceSkips(t *testing.T) { + db, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer db.Close() + + id := uuid.New() + mock.ExpectQuery(`SELECT id, COALESCE\(provider_id`). + WillReturnRows(sqlmock.NewRows([]string{"id", "provider_id"}). + AddRow(id, "app-raced")) + mock.ExpectExec(`UPDATE deployments`). + WithArgs(id, "healthy"). + WillReturnResult(sqlmock.NewResult(0, 0)) // 0 rows = raced + mock.ExpectQuery(`SELECT count\(\*\) FROM deployments WHERE scaled_to_zero = true`). 
+ WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(0)) + + prov := &recordingScaleProvider{} + w := NewDeployIdleScaler(db, prov, true, 30) + + before := testutil.ToFloat64(metrics.DeployScaledToZeroTotal.WithLabelValues("scaled_down")) + if err := w.Work(context.Background(), idleScalerJob()); err != nil { + t.Fatalf("Work: %v", err) + } + after := testutil.ToFloat64(metrics.DeployScaledToZeroTotal.WithLabelValues("scaled_down")) + if after != before { + t.Errorf("CAS-raced row must NOT increment scaled_down: before=%v after=%v", before, after) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet SQL expectations: %v", err) + } +} + +// TestDeployIdleScaler_NotFoundSkips: a torn-down Deployment (NotFound) is +// skipped — no DB flip, no failure counter. +func TestDeployIdleScaler_NotFoundSkips(t *testing.T) { + db, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer db.Close() + + id := uuid.New() + mock.ExpectQuery(`SELECT id, COALESCE\(provider_id`). + WillReturnRows(sqlmock.NewRows([]string{"id", "provider_id"}). + AddRow(id, "app-gone")) + // No UPDATE expected — NotFound short-circuits before the DB flip. + mock.ExpectQuery(`SELECT count\(\*\) FROM deployments WHERE scaled_to_zero = true`). 
+ WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(0)) + + prov := &recordingScaleProvider{notFound: true} + w := NewDeployIdleScaler(db, prov, true, 30) + + beforeFail := testutil.ToFloat64(metrics.DeployScaledToZeroTotal.WithLabelValues("scale_failed")) + if err := w.Work(context.Background(), idleScalerJob()); err != nil { + t.Fatalf("Work: %v", err) + } + afterFail := testutil.ToFloat64(metrics.DeployScaledToZeroTotal.WithLabelValues("scale_failed")) + if afterFail != beforeFail { + t.Errorf("NotFound must NOT increment scale_failed: before=%v after=%v", beforeFail, afterFail) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet SQL expectations: %v", err) + } +} + +// TestDeployIdleScaler_ScaleErrorCounts: a non-NotFound k8s error increments +// scale_failed and leaves the row untouched (no UPDATE). +func TestDeployIdleScaler_ScaleErrorCounts(t *testing.T) { + db, mock, err := sqlmock.New() + if err != nil { + t.Fatalf("sqlmock.New: %v", err) + } + defer db.Close() + + id := uuid.New() + mock.ExpectQuery(`SELECT id, COALESCE\(provider_id`). + WillReturnRows(sqlmock.NewRows([]string{"id", "provider_id"}). + AddRow(id, "app-boom")) + mock.ExpectQuery(`SELECT count\(\*\) FROM deployments WHERE scaled_to_zero = true`). 
+ WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(0)) + + prov := &recordingScaleProvider{err: errors.New("k8s boom")} + w := NewDeployIdleScaler(db, prov, true, 30) + + before := testutil.ToFloat64(metrics.DeployScaledToZeroTotal.WithLabelValues("scale_failed")) + if err := w.Work(context.Background(), idleScalerJob()); err != nil { + t.Fatalf("Work: %v", err) + } + after := testutil.ToFloat64(metrics.DeployScaledToZeroTotal.WithLabelValues("scale_failed")) + if after != before+1 { + t.Errorf("k8s error must increment scale_failed: before=%v after=%v", before, after) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet SQL expectations: %v", err) + } +} + +// TestNewDeployIdleScaler_FloorsIdleMinutes: a sub-5 threshold floors to 30 so +// a misconfig can't make the scaler aggressively flap apps to sleep. +func TestNewDeployIdleScaler_FloorsIdleMinutes(t *testing.T) { + w := NewDeployIdleScaler(nil, nil, true, 1) + if w.idleMinutes != 30 { + t.Errorf("sub-5 idleMinutes should floor to 30; got %d", w.idleMinutes) + } + w2 := NewDeployIdleScaler(nil, nil, true, 45) + if w2.idleMinutes != 45 { + t.Errorf("valid idleMinutes should pass through; got %d", w2.idleMinutes) + } +} + +// TestNamespaceAndNameFromProviderID covers the derivation + bad-shape rejection. +func TestNamespaceAndNameFromProviderID(t *testing.T) { + cases := []struct { + providerID string + wantNS string + wantName string + }{ + {"app-abc", "instant-deploy-abc", "app-abc"}, + {"instant-stack-xyz", "", ""}, // stack row — not ours + {"app-", "", ""}, // empty appID + {"", "", ""}, + } + for _, c := range cases { + ns, name := namespaceAndNameFromProviderID(c.providerID) + if ns != c.wantNS || name != c.wantName { + t.Errorf("namespaceAndNameFromProviderID(%q) = (%q,%q); want (%q,%q)", + c.providerID, ns, name, c.wantNS, c.wantName) + } + } +} + +// TestDeployIdleScalerArgs_Kind pins the River job kind. 
+func TestDeployIdleScalerArgs_Kind(t *testing.T) { + if (DeployIdleScalerArgs{}).Kind() != "deploy_idle_scaler" { + t.Errorf("Kind() = %q; want deploy_idle_scaler", (DeployIdleScalerArgs{}).Kind()) + } +} + +// TestNewK8sDeployScaleClientFromCluster_NoConfig exercises the cluster +// constructor's error path when neither in-cluster config nor a kubeconfig is +// reachable. Gated like TestNewDeployK8sClientset_NoConfig so it does not pick +// up a developer's ~/.kube/config. +func TestNewK8sDeployScaleClientFromCluster_NoConfig(t *testing.T) { + if _, err := os.Stat(clientcmd.RecommendedHomeFile); err == nil { + t.Skip("kubeconfig present on host — error path not reachable here") + } + if os.Getenv("KUBERNETES_SERVICE_HOST") != "" { + t.Skip("running in-cluster — in-cluster config will succeed") + } + if _, err := NewK8sDeployScaleClientFromCluster(); err == nil { + t.Error("expected error with no in-cluster config and no kubeconfig") + } +} + +// TestNewK8sDeployScaleClientFromCluster_Success overrides the clientset builder +// seam so the success return is exercised without a reachable cluster. A +// rest.Config pointed at an unroutable host builds a *kubernetes.Clientset +// without connecting, proving the constructor wraps it into a non-nil provider. +func TestNewK8sDeployScaleClientFromCluster_Success(t *testing.T) { + orig := newDeployScaleClientset + t.Cleanup(func() { newDeployScaleClientset = orig }) + newDeployScaleClientset = func() (*kubernetes.Clientset, error) { + return kubernetes.NewForConfig(&rest.Config{Host: "http://localhost:1"}) + } + prov, err := NewK8sDeployScaleClientFromCluster() + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if prov == nil { + t.Fatal("expected non-nil scale provider") + } +} + +// TestBuildIdleScaleK8s covers both branches of the StartWorkers helper that +// wires the idle-scaler's k8s client: success returns the provider; a builder +// error returns nil (fail-open) rather than propagating. 
+func TestBuildIdleScaleK8s(t *testing.T) {
+	// Both subtests overwrite the package-level builder seam; restore the
+	// original once this test and its subtests have finished.
+	orig := newDeployScaleClientset
+	t.Cleanup(func() { newDeployScaleClientset = orig })
+
+	t.Run("success returns provider", func(t *testing.T) {
+		// Builder succeeds → the helper must hand back a non-nil provider.
+		newDeployScaleClientset = func() (*kubernetes.Clientset, error) {
+			return kubernetes.NewForConfig(&rest.Config{Host: "http://localhost:1"})
+		}
+		if got := buildIdleScaleK8s(); got == nil {
+			t.Fatal("expected non-nil provider on success")
+		}
+	})
+
+	t.Run("builder error returns nil", func(t *testing.T) {
+		// Builder fails → the helper must return nil rather than an error.
+		newDeployScaleClientset = func() (*kubernetes.Clientset, error) {
+			return nil, errors.New("no cluster")
+		}
+		if got := buildIdleScaleK8s(); got != nil {
+			t.Fatalf("expected nil provider on builder error; got %v", got)
+		}
+	})
+}
+
+// TestDeployIdleScaler_ListQueryError: a failing candidate SELECT bubbles up as
+// a job error (River retries).
+func TestDeployIdleScaler_ListQueryError(t *testing.T) {
+	db, mock, err := sqlmock.New()
+	if err != nil {
+		t.Fatalf("sqlmock.New: %v", err)
+	}
+	defer db.Close()
+	// The only expected statement is the candidate SELECT, and it fails.
+	mock.ExpectQuery(`SELECT id, COALESCE\(provider_id`).
+		WillReturnError(errors.New("db down"))
+	// Worker under test: enabled flag true, idle threshold 30 (the same
+	// argument order StartWorkers uses: db, provider, enabled, idle minutes).
+	w := NewDeployIdleScaler(db, &recordingScaleProvider{}, true, 30)
+	if err := w.Work(context.Background(), idleScalerJob()); err == nil {
+		t.Error("list query error should fail the job")
+	}
+}
+
+// TestDeployIdleScaler_ScanError: a row whose id column is a non-UUID string
+// forces a rows.Scan error inside listIdleCandidates → job error.
+func TestDeployIdleScaler_ScanError(t *testing.T) {
+	db, mock, err := sqlmock.New()
+	if err != nil {
+		t.Fatalf("sqlmock.New: %v", err)
+	}
+	defer db.Close()
+	// The candidate SELECT succeeds but yields an id that cannot be a UUID.
+	mock.ExpectQuery(`SELECT id, COALESCE\(provider_id`).
+		WillReturnRows(sqlmock.NewRows([]string{"id", "provider_id"}).
+			AddRow("not-a-uuid", "app-x"))
+	w := NewDeployIdleScaler(db, &recordingScaleProvider{}, true, 30)
+	if err := w.Work(context.Background(), idleScalerJob()); err == nil {
+		t.Error("scan error should fail the job")
+	}
+}
+
+// TestDeployIdleScaler_SkipsForeignProviderID: a candidate whose provider_id is
+// not of the form app-<suffix> (e.g. a stack row that slipped the SQL filter)
+// is skipped without a scale call or DB flip.
+//
+// NOTE(review): only the "no scale call" half is asserted. No UPDATE is
+// expected on the mock and mock.ExpectationsWereMet() is never called, so the
+// "no DB flip" half presumably relies on sqlmock rejecting an unexpected
+// UPDATE inside Work — confirm, or assert it explicitly.
+func TestDeployIdleScaler_SkipsForeignProviderID(t *testing.T) {
+	db, mock, err := sqlmock.New()
+	if err != nil {
+		t.Fatalf("sqlmock.New: %v", err)
+	}
+	defer db.Close()
+	// One candidate row carrying a provider_id in the foreign (stack) shape.
+	mock.ExpectQuery(`SELECT id, COALESCE\(provider_id`).
+		WillReturnRows(sqlmock.NewRows([]string{"id", "provider_id"}).
+			AddRow(uuid.New(), "instant-stack-xyz"))
+	// The asleep-count query is still expected after the candidate is skipped.
+	mock.ExpectQuery(`SELECT count\(\*\) FROM deployments WHERE scaled_to_zero = true`).
+		WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(0))
+	prov := &recordingScaleProvider{}
+	w := NewDeployIdleScaler(db, prov, true, 30)
+	if err := w.Work(context.Background(), idleScalerJob()); err != nil {
+		t.Fatalf("Work: %v", err)
+	}
+	if prov.callCount() != 0 {
+		t.Errorf("foreign provider_id must not be scaled; got %d calls", prov.callCount())
+	}
+}
+
+// TestDeployIdleScaler_DBFlipError: a failing scaled_to_zero UPDATE after a
+// successful scale increments scale_failed (the row is retried next tick).
+// The job itself must still return nil.
+func TestDeployIdleScaler_DBFlipError(t *testing.T) {
+	db, mock, err := sqlmock.New()
+	if err != nil {
+		t.Fatalf("sqlmock.New: %v", err)
+	}
+	defer db.Close()
+	id := uuid.New()
+	// Expected statements, in order: candidate SELECT (one app- row), the
+	// UPDATE (which fails), then the asleep-count query.
+	mock.ExpectQuery(`SELECT id, COALESCE\(provider_id`).
+		WillReturnRows(sqlmock.NewRows([]string{"id", "provider_id"}).AddRow(id, "app-dbflip"))
+	// The UPDATE is expected with the deployment id and the literal "healthy"
+	// (presumably a status guard — confirm against the worker's SQL).
+	mock.ExpectExec(`UPDATE deployments`).
+		WithArgs(id, "healthy").
+		WillReturnError(errors.New("update exploded"))
+	mock.ExpectQuery(`SELECT count\(\*\) FROM deployments WHERE scaled_to_zero = true`).
+		WillReturnRows(sqlmock.NewRows([]string{"count"}).AddRow(0))
+	// recordingScaleProvider is defined elsewhere; this test assumes its scale
+	// call succeeds so that the UPDATE is reached.
+	prov := &recordingScaleProvider{}
+	w := NewDeployIdleScaler(db, prov, true, 30)
+
+	// Sample the counter around Work: it is process-global, so compare deltas.
+	before := testutil.ToFloat64(metrics.DeployScaledToZeroTotal.WithLabelValues("scale_failed"))
+	if err := w.Work(context.Background(), idleScalerJob()); err != nil {
+		t.Fatalf("Work: %v", err)
+	}
+	after := testutil.ToFloat64(metrics.DeployScaledToZeroTotal.WithLabelValues("scale_failed"))
+	if after != before+1 {
+		t.Errorf("db-flip error must increment scale_failed: before=%v after=%v", before, after)
+	}
+}
+
+// TestDeployIdleScaler_GaugeSampleError: a failing countAsleep query is logged
+// but does not fail the job (the scale-down already succeeded).
+func TestDeployIdleScaler_GaugeSampleError(t *testing.T) {
+	db, mock, err := sqlmock.New()
+	if err != nil {
+		t.Fatalf("sqlmock.New: %v", err)
+	}
+	defer db.Close()
+	id := uuid.New()
+	// Candidate SELECT and UPDATE both succeed; only the final count fails.
+	mock.ExpectQuery(`SELECT id, COALESCE\(provider_id`).
+		WillReturnRows(sqlmock.NewRows([]string{"id", "provider_id"}).AddRow(id, "app-g"))
+	mock.ExpectExec(`UPDATE deployments`).
+		WithArgs(id, "healthy").
+		WillReturnResult(sqlmock.NewResult(0, 1))
+	mock.ExpectQuery(`SELECT count\(\*\) FROM deployments WHERE scaled_to_zero = true`).
+		WillReturnError(errors.New("count failed"))
+	w := NewDeployIdleScaler(db, &recordingScaleProvider{}, true, 30)
+	if err := w.Work(context.Background(), idleScalerJob()); err != nil {
+		t.Fatalf("gauge-sample error must not fail the job, got: %v", err)
+	}
+}
+
+// TestK8sDeployScaleClient_ScaleDeployment covers the production scale client
+// against a fake clientset: scale a seeded Deployment to 0, an already-at-target
+// no-op, and a NotFound on a missing Deployment.
+func TestK8sDeployScaleClient_ScaleDeployment(t *testing.T) { + one := int32(1) + cs := clientfake.NewSimpleClientset(&appsv1.Deployment{ + ObjectMeta: metav1.ObjectMeta{Name: "app-x", Namespace: "instant-deploy-x"}, + Spec: appsv1.DeploymentSpec{Replicas: &one}, + }) + c := NewK8sDeployScaleClient(cs) + + // Scale down to 0. + if err := c.ScaleDeployment(context.Background(), "instant-deploy-x", "app-x", 0); err != nil { + t.Fatalf("ScaleDeployment(0): %v", err) + } + got, _ := cs.AppsV1().Deployments("instant-deploy-x").Get(context.Background(), "app-x", metav1.GetOptions{}) + if got.Spec.Replicas == nil || *got.Spec.Replicas != 0 { + t.Errorf("replicas after scale-down = %v; want 0", got.Spec.Replicas) + } + + // Already at 0 → idempotent no-op (no error). + if err := c.ScaleDeployment(context.Background(), "instant-deploy-x", "app-x", 0); err != nil { + t.Errorf("ScaleDeployment(0) idempotent should be nil: %v", err) + } + + // Missing Deployment → NotFound surfaced (caller maps to skip). + err := c.ScaleDeployment(context.Background(), "instant-deploy-missing", "app-missing", 0) + if !apierrors.IsNotFound(err) { + t.Errorf("ScaleDeployment on missing Deployment should return NotFound, got: %v", err) + } +} diff --git a/internal/jobs/workers.go b/internal/jobs/workers.go index 7387140..197eb88 100644 --- a/internal/jobs/workers.go +++ b/internal/jobs/workers.go @@ -317,6 +317,23 @@ func newMinioAdminClient(cfg *config.Config) (*madmin.AdminClient, error) { }) } +// buildIdleScaleK8s constructs the scale-to-zero idle-scaler's k8s client from +// cluster config. Returns nil (NOT an error) when no cluster is reachable +// (CI / docker-compose) so StartWorkers stays fail-open: the worker warn-logs +// and the idle-scaler short-circuits each tick while every other periodic job +// keeps running. Extracted from StartWorkers so the success/failure branches +// are unit-testable without a live River DB. 
+func buildIdleScaleK8s() deployScaleK8sProvider { + scaleClient, scErr := NewK8sDeployScaleClientFromCluster() + if scErr != nil { + slog.Warn("workers.deploy_idle_scaler.k8s_client_unavailable", + "error", scErr, + "note", "idle-scaler will short-circuit each tick until the worker restarts with a reachable cluster") + return nil + } + return scaleClient +} + func StartWorkers(ctx context.Context, db *sql.DB, rdb *redis.Client, cfg *config.Config, provClient *provisioner.Client, planRegistry PlanRegistry, backupPlans BackupPlanRegistry, deployStatusK8s deployStatusK8sProvider, deployAutopsyK8s deployAutopsyK8sProvider, nrApp *newrelic.Application) *Workers { // rdb is used by LoopsEventForwarderWorker (cursor storage). Other // workers access redis indirectly via the platform DB. @@ -506,6 +523,15 @@ func StartWorkers(ctx context.Context, db *sql.DB, rdb *redis.Client, cfg *confi statusReconciler := NewDeployStatusReconciler(db, deployStatusK8s). WithAutopsyK8s(deployAutopsyK8s) river.AddWorker(workers, WithObservability(statusReconciler, nrApp)) + // Scale-to-zero idle-scaler (Task #54). INERT unless + // DEPLOY_SCALE_TO_ZERO_ENABLED is set (default off). Builds its own scale + // client from cluster config; nil when unreachable (CI / docker-compose) → + // the worker warn-logs each tick and other periodic jobs keep running. See + // deploy_idle_scaler.go for the idle-signal + cold-start design notes. + idleScaleK8s := buildIdleScaleK8s() + river.AddWorker(workers, WithObservability( + NewDeployIdleScaler(db, idleScaleK8s, cfg.DeployScaleToZeroEnabled, cfg.DeployScaleToZeroIdleMinutes), + nrApp)) // Event-email forwarder — drains audit_log rows into the configured // provider every 60s for lifecycle email triggering. The provider is // always non-nil (NoopProvider when EMAIL_PROVIDER is unset). 
See
@@ -1126,6 +1152,18 @@ func buildPeriodicJobs(cfg *config.Config) []*river.PeriodicJob {
 			},
 			&river.PeriodicJobOpts{RunOnStart: true},
 		),
+		// Scale-to-zero idle-scaler (Task #54) — every 2 min. INERT unless
+		// DEPLOY_SCALE_TO_ZERO_ENABLED is "true" (the worker is registered regardless;
+		// its Work() short-circuits when the flag is off). RunOnStart=false: there's
+		// no backlog to drain on boot and we don't want a worker restart to
+		// immediately deschedule apps before the first idle window elapses.
+		river.NewPeriodicJob(
+			river.PeriodicInterval(deployIdleScalerInterval),
+			func() (river.JobArgs, *river.InsertOpts) {
+				return DeployIdleScalerArgs{}, reconcileInsertOpts(deployIdleScalerInterval)
+			},
+			&river.PeriodicJobOpts{RunOnStart: false},
+		),
 		// Magic-link reconciler — every 60s. RunOnStart=true so a worker
 		// restart immediately drains rows whose first send failed while
 		// the worker was down (we have a 15-min TTL window to retry, so
diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go
index 8ed991d..d400534 100644
--- a/internal/metrics/metrics.go
+++ b/internal/metrics/metrics.go
@@ -123,6 +123,39 @@ var (
 		Help: "Deployments soft-deleted (status='expired') by the expirer worker.",
 	})
 
+	// ── scale-to-zero (deploy_idle_scaler.go, Task #54) ──────────────────────
+	//
+	// DeployScaledToZeroTotal increments once per scale action, labelled by
+	// outcome:
+	//   outcome="scaled_down"  — an idle app was descheduled to replicas=0
+	//                            (k8s patch + DB flip both succeeded). The
+	//                            happy "we saved compute" path.
+	//   outcome="woke_up"      — reserved for a worker-initiated wake (the api
+	//                            wake endpoint owns the user-initiated path);
+	//                            present so the dashboard series exists.
+	//   outcome="wake_failed"  — a wake/scale-up attempt failed (k8s error).
+	//                            P1 if > 0: a user's app may be stuck asleep.
+	//   outcome="scale_failed" — a scale-DOWN attempt failed: the k8s patch or the
+	//                            follow-up scaled_to_zero DB flip errored (retried next tick). P2 observ.
+	//
+	// NR alert: deploy-scale-to-zero-fail.json (wake_failed > 0 → P1;
+	// scale_failed sustained → P2). Prom rule: DeployScaleToZeroFailures.
+	// Dashboard tile: infra/newrelic/dashboards/instanode-reliability.json.
+	// Catalog: infra/observability/METRICS-CATALOG.md (lazy *Vec — label families are primed in
+	// metrics_test.go, which runs only under `go test`; NOTE(review): confirm the serving binary primes them too).
+	DeployScaledToZeroTotal = promauto.NewCounterVec(prometheus.CounterOpts{
+		Name: "instant_deploy_scaled_to_zero_total",
+		Help: "Scale-to-zero idle-scaler actions by outcome (scaled_down | woke_up | wake_failed | scale_failed).",
+	}, []string{"outcome"})
+
+	// DeployIdleApps is the gauge of apps observed asleep (scaled_to_zero=true)
+	// at the end of each idle-scaler tick. Tracks the descheduled fleet size —
+	// the headline "how much compute scale-to-zero is reclaiming" signal.
+	DeployIdleApps = promauto.NewGauge(prometheus.GaugeOpts{
+		Name: "instant_deploy_idle_apps",
+		Help: "Deployments currently scaled to zero (asleep), sampled each idle-scaler tick.",
+	})
+
 	// DeployRemindersSentTotal counts reminder emails actually dispatched
 	// to a real owner email (post-CAS, post-email-send).
 	DeployRemindersSentTotal = promauto.NewCounter(prometheus.CounterOpts{
diff --git a/internal/metrics/metrics_test.go b/internal/metrics/metrics_test.go
index 60fcfdb..6d6f596 100644
--- a/internal/metrics/metrics_test.go
+++ b/internal/metrics/metrics_test.go
@@ -108,6 +108,16 @@ func TestAllMetrics_AreRegistered(t *testing.T) {
 	DeployAutopsyCapturedTotal.WithLabelValues("logs_unavailable").Add(0)
 	DeployAutopsyCapturedTotal.WithLabelValues("already_present").Add(0)
 	DeployAutopsyCapturedTotal.WithLabelValues("audit_emit_failed").Add(0)
+	// Prime all four scale-to-zero outcome label values so each series exists
+	// in this test process (a lazy *Vec otherwise exposes no series for an
+	// outcome until its first increment).
+ DeployScaledToZeroTotal.WithLabelValues("scaled_down").Add(0) + DeployScaledToZeroTotal.WithLabelValues("woke_up").Add(0) + DeployScaledToZeroTotal.WithLabelValues("wake_failed").Add(0) + DeployScaledToZeroTotal.WithLabelValues("scale_failed").Add(0) + + // Plain gauge + DeployIdleApps.Set(0) // Gauge vecs ResourceDegradedGauge.WithLabelValues("postgres").Set(0)