databricks · varundeepsaini · Dec 15, 2025 · Dec 16, 2025 · Dec 17, 2025 · Dec 30, 2025
@@ -156,6 +156,12 @@ type ServerStub struct {
 	// Configure as "1ms", "2s", "3m", etc.
 	// See [time.ParseDuration] for details.
 	Delay time.Duration
+
+	// Number of times to kill the caller process before returning normal responses.
+	// 0 = never kill (default), 1 = kill once then allow, 2 = kill twice then allow, etc.
+	// Useful for testing crash recovery scenarios where first deploy crashes but retry succeeds.
+	// Requires DATABRICKS_CLI_TEST_PID=1 to be set in the test environment.
+	KillCaller int
 }
 
 // FindConfigs finds all the config relevant for this test,

@@ -184,13 +184,22 @@ func startLocalServer(t *testing.T,
 		s.ResponseCallback = logResponseCallback(t)
 	}
 
+	// Track remaining kill counts per pattern (for KillCaller > 0)
+	killCounters := make(map[string]int)
+
 	for ind := range stubs {
 		// We want later stubs takes precedence, because then leaf configs take precedence over parent directory configs
 		// In gorilla/mux earlier handlers take precedence, so we need to reverse the order
 		stub := stubs[len(stubs)-1-ind]
 		require.NotEmpty(t, stub.Pattern)
 		items := strings.Split(stub.Pattern, " ")
 		require.Len(t, items, 2)
+
+		// Initialize kill counter for this pattern
+		if stub.KillCaller > 0 {
+			killCounters[stub.Pattern] = stub.KillCaller
+		}
+
 		s.Handle(items[0], items[1], func(req testserver.Request) any {
 			if stub.Delay > 0 {
 				ctx := req.Context
@@ -209,6 +218,11 @@ func startLocalServer(t *testing.T,
 				}
 			}
 
+			if shouldKillCaller(stub, killCounters) {
+				killCaller(t, stub.Pattern, req.Headers)
+				return testserver.Response{StatusCode: http.StatusOK}
+			}
+
 			return stub.Response
 		})
 	}
@@ -218,6 +232,37 @@ func startLocalServer(t *testing.T,
 	return s.URL
 }
 
+// shouldKillCaller reports whether the request being handled for stub must
+// end with the calling process being killed, consuming one unit of the
+// stub's remaining kill budget when it does.
+//
+// killCounters maps a stub pattern to the number of kills still owed for it.
+// A stub with KillCaller <= 0, or whose counter is missing or exhausted,
+// never triggers a kill and leaves the map untouched.
+//
+// NOTE(review): the map is read and written here without any locking; if
+// handlers of the same server can run concurrently this needs a mutex —
+// confirm against how testserver dispatches requests.
+func shouldKillCaller(stub ServerStub, killCounters map[string]int) bool {
+	remaining := killCounters[stub.Pattern]
+	if stub.KillCaller > 0 && remaining > 0 {
+		killCounters[stub.Pattern] = remaining - 1
+		return true
+	}
+	return false
+}
+
+// killCaller terminates the process that issued the current request.
+//
+// The target PID is obtained from the request headers through
+// testserver.ExtractPidFromHeaders. Every failure (no PID available, process
+// lookup error, kill error) is reported with t.Errorf and ends the function
+// early; a successful kill is recorded with t.Logf together with the stub
+// pattern that triggered it.
+func killCaller(t *testing.T, pattern string, headers http.Header) {
+	callerPid := testserver.ExtractPidFromHeaders(headers)
+	if callerPid == 0 {
+		// Zero is what the extractor yields when it has no PID to offer.
+		t.Errorf("KillCaller configured but test-pid not found in User-Agent")
+		return
+	}
+
+	proc, findErr := os.FindProcess(callerPid)
+	if findErr != nil {
+		t.Errorf("Failed to find process %d: %s", callerPid, findErr)
+		return
+	}
+
+	// Kill() is chosen for cross-platform compatibility: on Unix it sends
+	// SIGKILL, on Windows it calls TerminateProcess.
+	killErr := proc.Kill()
+	if killErr != nil {
+		t.Errorf("Failed to kill process %d: %s", callerPid, killErr)
+		return
+	}
+
+	t.Logf("KillCaller: killed PID %d (pattern: %s)", callerPid, pattern)
+}
+
 func startProxyServer(t *testing.T,
 	logRequests bool,
 	includeHeaders []string,

@@ -0,0 +1,6 @@
+
+>>> errcode [CLI] current-user me
+[PROCESS_KILLED]
+
+Exit code: [KILLED]
+Script continued after kill
@@ -0,0 +1,2 @@
+trace errcode $CLI current-user me
+echo "Script continued after kill"
@@ -0,0 +1,4 @@
+# Kill the CLI when it calls /Me endpoint (once, then allow)
+[[Server]]
+Pattern = "GET /api/2.0/preview/scim/v2/Me"
+KillCaller = 1
@@ -0,0 +1,25 @@
+
+>>> errcode [CLI] current-user me
+[PROCESS_KILLED]
+
+Exit code: [KILLED]
+Attempt 1 done
+
+>>> errcode [CLI] current-user me
+[PROCESS_KILLED]
+
+Exit code: [KILLED]
+Attempt 2 done
+
+>>> errcode [CLI] current-user me
+[PROCESS_KILLED]
+
+Exit code: [KILLED]
+Attempt 3 done
+
+>>> [CLI] current-user me
+{
+  "id":"123",
+  "userName":"test@example.com"
+}
+Attempt 4 done - success!
@@ -0,0 +1,13 @@
+# First 3 attempts should be killed
+trace errcode $CLI current-user me
+echo "Attempt 1 done"
+
+trace errcode $CLI current-user me
+echo "Attempt 2 done"
+
+trace errcode $CLI current-user me
+echo "Attempt 3 done"
+
+# 4th attempt should succeed
+trace $CLI current-user me
+echo "Attempt 4 done - success!"
@@ -0,0 +1,10 @@
+# Kill the CLI 3 times, then allow the 4th request to succeed
+[[Server]]
+Pattern = "GET /api/2.0/preview/scim/v2/Me"
+KillCaller = 3
+Response.Body = '''
+{
+    "id": "123",
+    "userName": "test@example.com"
+}
+'''
@@ -0,0 +1,30 @@
+# KillCaller tests verify the test server's ability to terminate CLI processes mid-request.
+# This enables testing crash recovery scenarios, e.g., "bundle deploy" fails on first attempt
+# but succeeds on retry. Each subdirectory tests a different endpoint or retry count.
+
+Local = true
+Env.DATABRICKS_CLI_TEST_PID = "1"
+
+[[Repls]]
+# macOS bash shows "Killed: 9" (with signal number), Linux shows "Killed"
+# Normalize the whole killed line to a placeholder
+Old = 'script: line \d+:\s+\d+ Killed(: 9)?\s+"\$@"'
+New = '[PROCESS_KILLED]'
+
+[[Repls]]
+# On Windows, there's no "Killed" message - just an empty line before Exit code
+# Insert [PROCESS_KILLED] placeholder for consistency
+Old = '(\n>>> errcode [^\n]+\n)\nExit code:'
+New = """${1}[PROCESS_KILLED]
+
+Exit code:"""
+
+[[Repls]]
+# Normalize exit code: 137 on Unix (128 + SIGKILL), 1 on Windows
+Old = 'Exit code: (137|1)'
+New = 'Exit code: [KILLED]'
+
+[[Repls]]
+# Normalize Windows line endings (CRLF -> LF) - must be LAST
+Old = "\r"
+New = ''
@@ -0,0 +1,6 @@
+
+>>> errcode [CLI] workspace list /
+[PROCESS_KILLED]
+
+Exit code: [KILLED]
+Script continued after kill
@@ -0,0 +1,2 @@
+trace errcode $CLI workspace list /
+echo "Script continued after kill"
@@ -0,0 +1,4 @@
+# Kill the CLI when it calls workspace list endpoint (once, then allow)
+[[Server]]
+Pattern = "GET /api/2.0/workspace/list"
+KillCaller = 1
@@ -8,6 +8,7 @@ import (
 	"os"
 	"strings"
 
+	"github.com/databricks/cli/cmd/root"
 	"github.com/databricks/cli/internal/build"
 	"github.com/databricks/cli/libs/log"
 	"github.com/spf13/cobra"
@@ -72,6 +73,7 @@ func New(ctx context.Context) *cobra.Command {
 		ctx = withCommandInUserAgent(ctx, cmd)
 		ctx = withCommandExecIdInUserAgent(ctx)
 		ctx = withUpstreamInUserAgent(ctx)
+		ctx = root.InjectTestPidToUserAgent(ctx)
 		cmd.SetContext(ctx)
 		return nil
 	}

@@ -79,6 +79,7 @@ func New(ctx context.Context) *cobra.Command {
 		ctx = withCommandInUserAgent(ctx, cmd)
 		ctx = withCommandExecIdInUserAgent(ctx)
 		ctx = withUpstreamInUserAgent(ctx)
+		ctx = InjectTestPidToUserAgent(ctx)
 		cmd.SetContext(ctx)
 		return nil
 	}

@@ -0,0 +1,28 @@
+package root
+
+import (
+	"context"
+	"os"
+	"strconv"
+
+	"github.com/databricks/cli/libs/env"
+	"github.com/databricks/databricks-sdk-go/useragent"
+)
+
+const (
+	// TestPidEnvVar is the environment variable that enables PID injection into the user agent.
+	// When set to "1", the CLI will include its process ID in the user agent string.
+	// This is used by the test server to identify and signal the CLI process.
+	TestPidEnvVar = "DATABRICKS_CLI_TEST_PID"
+	testPidKey    = "test-pid"
+)
+
+// InjectTestPidToUserAgent returns a context whose user agent carries the
+// current process ID under the test-pid key, but only when the
+// DATABRICKS_CLI_TEST_PID variable, looked up through ctx, is exactly "1".
+// For any other value ctx is returned unchanged. The acceptance test server
+// uses the PID to identify and signal this process.
+func InjectTestPidToUserAgent(ctx context.Context) context.Context {
+	if env.Get(ctx, TestPidEnvVar) == "1" {
+		pid := strconv.Itoa(os.Getpid())
+		return useragent.InContext(ctx, testPidKey, pid)
+	}
+	return ctx
+}
@@ -10,14 +10,32 @@ import (
 	"net/http/httptest"
 	"net/url"
 	"reflect"
+	"regexp"
+	"strconv"
 	"strings"
 	"sync"
 
-	"github.com/gorilla/mux"
-
 	"github.com/databricks/cli/internal/testutil"
+	"github.com/gorilla/mux"
 )
 
+const testPidKey = "test-pid"
+
+var testPidRegex = regexp.MustCompile(testPidKey + `/(\d+)`)
+
+// ExtractPidFromHeaders returns the process ID advertised in the User-Agent
+// header of headers, as located by testPidRegex. It returns 0 when the header
+// contains no match or when the matched digits cannot be converted to an int.
+func ExtractPidFromHeaders(headers http.Header) int {
+	userAgent := headers.Get("User-Agent")
+	groups := testPidRegex.FindStringSubmatch(userAgent)
+	if len(groups) >= 2 {
+		// groups[1] is the captured run of digits following the key.
+		if pid, err := strconv.Atoi(groups[1]); err == nil {
+			return pid
+		}
+	}
+	return 0
+}
+
 type Server struct {
 	*httptest.Server
 	Router *mux.Router
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		trace errcode $CLI current-user me
varundeepsaini marked this conversation as resolved. Show resolved Hide resolved
		echo "Script continued after kill"
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		trace errcode $CLI workspace list /
		echo "Script continued after kill"