From 05fae569eac77a1d255df094f5dd255e60103553 Mon Sep 17 00:00:00 2001 From: Manas Srivastava Date: Sun, 7 Jun 2026 20:03:43 +0530 Subject: [PATCH] fix(provisioner): bound ProvisionCache gRPC with a 45s deadline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Authed /cache/new (PRO Redis provision) was observed hanging >60s while anonymous /cache/new returned in ~6s. The provisioner-side root cause is fixed separately; this is the defensive api-side guard. provClient ALREADY applies a per-call deadline to every provision RPC via provisionTimeout(tier) — the issue is the value: 4m (anon/free/hobby) and 5m (pro/team/growth). Those budgets exist for Postgres/Mongo, which spin up a per-tenant pod (PVC bind + image pull + DB init, 30-90s on a cold node). A Redis namespace carve does none of that (~1-6s for every tier), so a hung provisioner could hang the whole /cache/new request for up to 5 minutes. Introduce cacheProvisionTimeout = 45s (named, no inline magic number; a package var so the timeout->503 unit test can shrink it) and use it in ProvisionCache only. 45s is ~7x the slowest observed healthy carve, so a legitimately slow-but-OK provision still succeeds while a genuine hang fails fast. On timeout the existing handler path runs unchanged: soft-delete the pending resource + 503 provision_failed — no orphan, no hang. db/vector/nosql/queue intentionally keep provisionTimeout(tier): they are pod-backed and genuinely need the cold-pod budget; tightening them to 45s would regress legitimate slow provisions. Noted as a deliberate non-change. Tests (internal/provisioner, no DB needed): - TestProvisionCache_HangBecomesDeadlineError: blocking mock provisioner -> ProvisionCache returns gRPC DeadlineExceeded in ~the bounded window, not an indefinite hang (the error the handler turns into a 503). - TestCacheProvisionTimeout_Value: pins 45s and asserts it stays tighter than provisionTimeout(pro) so a future edit can't silently reintroduce the multi-minute hang. ProvisionCache coverage 100%. Co-Authored-By: Claude Opus 4.8 (1M context) --- internal/provisioner/client.go | 27 +++++++- internal/provisioner/client_cov_test.go | 85 +++++++++++++++++++++++++ 2 files changed, 111 insertions(+), 1 deletion(-) diff --git a/internal/provisioner/client.go b/internal/provisioner/client.go index 939d5973..f0cb5be5 100644 --- a/internal/provisioner/client.go +++ b/internal/provisioner/client.go @@ -33,6 +33,28 @@ const ( provisionerCircuitCooldown = 30 * time.Second ) +// cacheProvisionTimeout bounds the api → provisioner ProvisionCache RPC. +// +// Unlike Postgres/Mongo, a Redis namespace provision does NOT spin up a +// per-tenant pod with a PVC bind + image pull + DB init — it carves a +// namespace/ACL out of an already-running Redis (observed ~1–6s end to +// end, anonymous AND authenticated). It therefore does not need the +// 4–5m cold-pod budget that provisionTimeout() grants the pod-backed +// resource types. +// +// Granting Redis that same multi-minute budget means a *hung* provisioner +// hangs the whole /cache/new request for up to 5 minutes (the symptom that +// motivated this fix: authed /cache/new observed hanging >60s while anon +// returned in ~6s). A generous-but-bounded 45s ceiling is ~7× the slowest +// observed healthy provision, so a legitimately slow-but-OK Redis carve +// still succeeds, while a genuine hang fails fast — the existing handler +// error path then soft-deletes the pending resource and returns a clean +// 503 provision_failed instead of an indefinite hang. +// +// A package var (not const) so the timeout→503 unit test can shrink it to +// a few ms instead of blocking the suite for 45s; production never mutates it. +var cacheProvisionTimeout = 45 * time.Second + // Credentials matches the shape returned by local providers. type Credentials struct { URL string @@ -313,7 +335,10 @@ func (c *Client) ProvisionPostgres(ctx context.Context, token, tier, teamID stri func (c *Client) ProvisionCache(ctx context.Context, token, tier, teamID string) (*Credentials, error) { return callWithBreaker(c.breaker, func() (*Credentials, error) { start := time.Now() - ctx, cancel := context.WithTimeout(c.ctxWithTeamID(c.ctxWithAuth(ctx), teamID), provisionTimeout(tier)) + // Redis carve is fast for every tier — bound it tighter than the + // pod-backed resource types so a hung provisioner returns a clean + // 503 in ≤45s instead of hanging /cache/new for minutes. + ctx, cancel := context.WithTimeout(c.ctxWithTeamID(c.ctxWithAuth(ctx), teamID), cacheProvisionTimeout) defer cancel() resp, err := c.grpc.ProvisionResource(ctx, &provisionerv1.ProvisionRequest{ Token: token, diff --git a/internal/provisioner/client_cov_test.go b/internal/provisioner/client_cov_test.go index f92a6dc6..76e64b81 100644 --- a/internal/provisioner/client_cov_test.go +++ b/internal/provisioner/client_cov_test.go @@ -43,6 +43,11 @@ type covServer struct { // Override ProvisionResource success payload. provisionResp *provisionerv1.ProvisionResponse + // If non-nil, ProvisionResource blocks until this channel is closed OR + // the RPC context is cancelled (whichever comes first) — simulating a + // hung provisioner so the client-side deadline can be exercised. + provisionBlock chan struct{} + // Sniffed inbound metadata from the most recent ProvisionResource call. lastAuthToken []string lastRequestID []string @@ -63,6 +68,15 @@ func (s *covServer) ProvisionResource(ctx context.Context, req *provisionerv1.Pr s.lastRequestID = md.Get("x-request-id") s.lastTeamID = md.Get("x-instant-team-id") } + if s.provisionBlock != nil { + // Simulate a hung provisioner: block until released or the RPC + // context (carrying the client's deadline) is cancelled. + select { + case <-s.provisionBlock: + case <-ctx.Done(): + return nil, status.FromContextError(ctx.Err()).Err() + } + } if s.provisionErr != nil { return nil, s.provisionErr } @@ -411,6 +425,77 @@ func TestProvisionCache_SuccessAndError(t *testing.T) { } } +// TestCacheProvisionTimeout_Value pins the production deadline so a future +// edit that re-grants Redis the multi-minute cold-pod budget (re-introducing +// the /cache/new hang) reds the suite. +func TestCacheProvisionTimeout_Value(t *testing.T) { + if cacheProvisionTimeout != 45*time.Second { + t.Errorf("cacheProvisionTimeout = %s; want 45s (generous-but-bounded Redis carve ceiling)", cacheProvisionTimeout) + } + // Must be well under provisionTimeout's pod-backed budget — that gap is + // the whole point: a hung provisioner fails /cache/new fast instead of + // hanging it for minutes. + if cacheProvisionTimeout >= provisionTimeout("pro") { + t.Errorf("cacheProvisionTimeout (%s) must be tighter than provisionTimeout(pro) (%s)", + cacheProvisionTimeout, provisionTimeout("pro")) + } +} + +// TestProvisionCache_HangBecomesDeadlineError proves the defensive fix: when +// the provisioner hangs, ProvisionCache's own client-side deadline fires and +// returns a DeadlineExceeded-class error in ~the bounded window — NOT an +// indefinite block. This is the error the cache handler converts into a 503 +// provision_failed after soft-deleting the pending resource. +// +// We shrink cacheProvisionTimeout to a few ms (restored via t.Cleanup) so the +// test is fast, and give the *caller* context a generous timeout so the error +// is provably caused by our internal deadline, not the caller's. +func TestProvisionCache_HangBecomesDeadlineError(t *testing.T) { + orig := cacheProvisionTimeout + cacheProvisionTimeout = 50 * time.Millisecond + t.Cleanup(func() { cacheProvisionTimeout = orig }) + + h := newCovHarness(t, "hang-secret") + defer h.close() + + // Server blocks forever (channel never closed) → only the client-side + // deadline can end the RPC. + h.mock.provisionBlock = make(chan struct{}) + + // Caller context is far more generous than the 50ms internal deadline, + // so a failure here is unambiguously our deadline firing. + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + done := make(chan error, 1) + start := time.Now() + go func() { + _, err := h.client.ProvisionCache(ctx, "tok", "pro", "team-hang") + done <- err + }() + + select { + case err := <-done: + elapsed := time.Since(start) + if err == nil { + t.Fatal("ProvisionCache returned nil; want a DeadlineExceeded-class error from the bounded deadline") + } + if !strings.Contains(err.Error(), "provisioner.ProvisionCache:") { + t.Errorf("err = %v; want wrapped ProvisionCache error", err) + } + if st, ok := status.FromError(err); !ok || st.Code() != codes.DeadlineExceeded { + t.Errorf("err = %v; want gRPC DeadlineExceeded", err) + } + // Bounded: must finish well within the caller's 10s, near the 50ms + // internal deadline — proves the api no longer hangs on a hung provisioner. + if elapsed > 5*time.Second { + t.Errorf("ProvisionCache took %s; expected ~the bounded deadline (50ms), not an indefinite hang", elapsed) + } + case <-time.After(8 * time.Second): + t.Fatal("ProvisionCache hung past the bounded deadline — the defensive timeout did not fire") + } +} + func TestProvisionNoSQL_SuccessAndError(t *testing.T) { h := newCovHarness(t, "mongo-secret") defer h.close()