Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 26 additions & 1 deletion internal/provisioner/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,28 @@ const (
provisionerCircuitCooldown = 30 * time.Second
)

// cacheProvisionTimeout bounds the api → provisioner ProvisionCache RPC.
//
// Unlike Postgres/Mongo, a Redis namespace provision does NOT spin up a
// per-tenant pod with a PVC bind + image pull + DB init — it carves a
// namespace/ACL out of an already-running Redis (observed ~1–6s end to
// end, anonymous AND authenticated). It therefore does not need the
// 4–5m cold-pod budget that provisionTimeout() grants the pod-backed
// resource types.
//
// Granting Redis that same multi-minute budget means a *hung* provisioner
// hangs the whole /cache/new request for up to 5 minutes (the symptom that
// motivated this fix: authed /cache/new observed hanging >60s while anon
// returned in ~6s). A generous-but-bounded 45s ceiling is ~7× the slowest
// observed healthy provision, so a legitimately slow-but-OK Redis carve
// still succeeds, while a genuine hang fails fast — the existing handler
// error path then soft-deletes the pending resource and returns a clean
// 503 provision_failed instead of an indefinite hang.
//
// A package var (not const) so the timeout→503 unit test can shrink it to
// a few ms instead of blocking the suite for 45s; production never mutates it.
var cacheProvisionTimeout = 45 * time.Second

// Credentials matches the shape returned by local providers.
type Credentials struct {
URL string
Expand Down Expand Up @@ -313,7 +335,10 @@ func (c *Client) ProvisionPostgres(ctx context.Context, token, tier, teamID stri
func (c *Client) ProvisionCache(ctx context.Context, token, tier, teamID string) (*Credentials, error) {
return callWithBreaker(c.breaker, func() (*Credentials, error) {
start := time.Now()
ctx, cancel := context.WithTimeout(c.ctxWithTeamID(c.ctxWithAuth(ctx), teamID), provisionTimeout(tier))
// Redis carve is fast for every tier — bound it tighter than the
// pod-backed resource types so a hung provisioner returns a clean
// 503 in ≤45s instead of hanging /cache/new for minutes.
ctx, cancel := context.WithTimeout(c.ctxWithTeamID(c.ctxWithAuth(ctx), teamID), cacheProvisionTimeout)
defer cancel()
resp, err := c.grpc.ProvisionResource(ctx, &provisionerv1.ProvisionRequest{
Token: token,
Expand Down
85 changes: 85 additions & 0 deletions internal/provisioner/client_cov_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,11 @@ type covServer struct {
// Override ProvisionResource success payload.
provisionResp *provisionerv1.ProvisionResponse

// If non-nil, ProvisionResource blocks until this channel is closed OR
// the RPC context is cancelled (whichever comes first) — simulating a
// hung provisioner so the client-side deadline can be exercised.
provisionBlock chan struct{}

// Sniffed inbound metadata from the most recent ProvisionResource call.
lastAuthToken []string
lastRequestID []string
Expand All @@ -63,6 +68,15 @@ func (s *covServer) ProvisionResource(ctx context.Context, req *provisionerv1.Pr
s.lastRequestID = md.Get("x-request-id")
s.lastTeamID = md.Get("x-instant-team-id")
}
if s.provisionBlock != nil {
// Simulate a hung provisioner: block until released or the RPC
// context (carrying the client's deadline) is cancelled.
select {
case <-s.provisionBlock:
case <-ctx.Done():
return nil, status.FromContextError(ctx.Err()).Err()
}
}
if s.provisionErr != nil {
return nil, s.provisionErr
}
Expand Down Expand Up @@ -411,6 +425,77 @@ func TestProvisionCache_SuccessAndError(t *testing.T) {
}
}

// TestCacheProvisionTimeout_Value pins the production deadline so a future
// edit that re-grants Redis the multi-minute cold-pod budget (re-introducing
// the /cache/new hang) reds the suite.
func TestCacheProvisionTimeout_Value(t *testing.T) {
if cacheProvisionTimeout != 45*time.Second {
t.Errorf("cacheProvisionTimeout = %s; want 45s (generous-but-bounded Redis carve ceiling)", cacheProvisionTimeout)
}
// Must be well under provisionTimeout's pod-backed budget — that gap is
// the whole point: a hung provisioner fails /cache/new fast instead of
// hanging it for minutes.
if cacheProvisionTimeout >= provisionTimeout("pro") {
t.Errorf("cacheProvisionTimeout (%s) must be tighter than provisionTimeout(pro) (%s)",
cacheProvisionTimeout, provisionTimeout("pro"))
}
}

// TestProvisionCache_HangBecomesDeadlineError proves the defensive fix: when
// the provisioner hangs, ProvisionCache's own client-side deadline fires and
// returns a DeadlineExceeded-class error in ~the bounded window — NOT an
// indefinite block. This is the error the cache handler converts into a 503
// provision_failed after soft-deleting the pending resource.
//
// We shrink cacheProvisionTimeout to a few ms (restored via t.Cleanup) so the
// test is fast, and give the *caller* context a generous timeout so the error
// is provably caused by our internal deadline, not the caller's.
func TestProvisionCache_HangBecomesDeadlineError(t *testing.T) {
orig := cacheProvisionTimeout
cacheProvisionTimeout = 50 * time.Millisecond
t.Cleanup(func() { cacheProvisionTimeout = orig })

h := newCovHarness(t, "hang-secret")
defer h.close()

// Server blocks forever (channel never closed) → only the client-side
// deadline can end the RPC.
h.mock.provisionBlock = make(chan struct{})

// Caller context is far more generous than the 50ms internal deadline,
// so a failure here is unambiguously our deadline firing.
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()

done := make(chan error, 1)
start := time.Now()
go func() {
_, err := h.client.ProvisionCache(ctx, "tok", "pro", "team-hang")
done <- err
}()

select {
case err := <-done:
elapsed := time.Since(start)
if err == nil {
t.Fatal("ProvisionCache returned nil; want a DeadlineExceeded-class error from the bounded deadline")
}
if !strings.Contains(err.Error(), "provisioner.ProvisionCache:") {
t.Errorf("err = %v; want wrapped ProvisionCache error", err)
}
if st, ok := status.FromError(err); !ok || st.Code() != codes.DeadlineExceeded {
t.Errorf("err = %v; want gRPC DeadlineExceeded", err)
}
// Bounded: must finish well within the caller's 10s, near the 50ms
// internal deadline — proves the api no longer hangs on a hung provisioner.
if elapsed > 5*time.Second {
t.Errorf("ProvisionCache took %s; expected ~the bounded deadline (50ms), not an indefinite hang", elapsed)
}
case <-time.After(8 * time.Second):
t.Fatal("ProvisionCache hung past the bounded deadline — the defensive timeout did not fire")
}
}

func TestProvisionNoSQL_SuccessAndError(t *testing.T) {
h := newCovHarness(t, "mongo-secret")
defer h.close()
Expand Down
Loading