From df3f0bcee14882e292ebe892f26eab3711f2e44d Mon Sep 17 00:00:00 2001 From: Taylor Mutch Date: Fri, 29 May 2026 12:47:02 -0700 Subject: [PATCH] fix(sandbox): refresh docker/podman/vm tokens in gateway Signed-off-by: Taylor Mutch --- Cargo.lock | 1 + architecture/compute-runtimes.md | 24 +- crates/openshell-core/Cargo.toml | 4 +- crates/openshell-core/src/paths.rs | 48 ++ crates/openshell-core/src/sandbox_env.rs | 84 ++- crates/openshell-driver-docker/README.md | 2 + crates/openshell-driver-docker/src/lib.rs | 108 +++- crates/openshell-driver-docker/src/tests.rs | 25 + crates/openshell-driver-kubernetes/README.md | 6 +- .../openshell-driver-kubernetes/src/config.rs | 16 +- .../openshell-driver-kubernetes/src/driver.rs | 5 + .../openshell-driver-kubernetes/src/grpc.rs | 25 +- .../openshell-driver-kubernetes/src/main.rs | 6 +- crates/openshell-driver-podman/Cargo.toml | 2 +- crates/openshell-driver-podman/README.md | 3 + .../openshell-driver-podman/src/container.rs | 31 +- crates/openshell-driver-podman/src/driver.rs | 148 ++++- crates/openshell-driver-podman/src/grpc.rs | 47 +- crates/openshell-driver-vm/README.md | 13 +- crates/openshell-driver-vm/src/driver.rs | 310 ++++++--- crates/openshell-sandbox/src/debug_rpc.rs | 17 +- crates/openshell-sandbox/src/grpc_client.rs | 407 +++++++----- crates/openshell-sandbox/src/process.rs | 2 + .../src/supervisor_session.rs | 38 ++ crates/openshell-server/src/compute/mod.rs | 603 +++++++++++++++++- crates/openshell-server/src/config_file.rs | 4 +- crates/openshell-server/src/grpc/auth_rpc.rs | 18 +- crates/openshell-server/src/lib.rs | 30 +- .../src/supervisor_session.rs | 27 +- e2e/rust/Cargo.toml | 15 + e2e/rust/e2e-docker.sh | 5 + e2e/rust/e2e-podman.sh | 5 + e2e/rust/e2e-vm.sh | 19 +- e2e/rust/src/harness/docker.rs | 121 ++++ e2e/rust/src/harness/mod.rs | 1 + e2e/rust/tests/gateway_resume.rs | 97 +-- .../tests/gateway_resume_expired_token.rs | 105 +++ .../podman_gateway_resume_expired_token.rs | 89 +++ .../tests/vm_gateway_resume_expired_token.rs | 89 +++ e2e/support/gateway-common.sh | 14 +- proto/compute_driver.proto | 40 +- proto/openshell.proto | 36 +- 42 files changed, 2203 insertions(+), 487 deletions(-) create mode 100644 e2e/rust/src/harness/docker.rs create mode 100644 e2e/rust/tests/gateway_resume_expired_token.rs create mode 100644 e2e/rust/tests/podman_gateway_resume_expired_token.rs create mode 100644 e2e/rust/tests/vm_gateway_resume_expired_token.rs diff --git a/Cargo.lock b/Cargo.lock index 92bc18499..4a4469f64 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5884,6 +5884,7 @@ version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "96374855068f47402c3121c6eed88d29cb1de8f3ab27090e273e420bdabcf050" dependencies = [ + "futures", "parking_lot", ] diff --git a/architecture/compute-runtimes.md b/architecture/compute-runtimes.md index d79b7366d..30274de08 100644 --- a/architecture/compute-runtimes.md +++ b/architecture/compute-runtimes.md @@ -29,7 +29,7 @@ reason strings. | Docker | Local development with Docker available. | Container plus nested sandbox namespace. | Uses host networking so loopback gateway endpoints work from the supervisor. | | Podman | Rootless or single-machine deployments. | Container plus nested sandbox namespace. | Uses the Podman REST API, OCI image volumes, and CDI GPU devices when available. | | Kubernetes | Cluster deployment through Helm. | Pod plus nested sandbox namespace. | Uses Kubernetes API objects, service accounts, secrets, PVC-backed workspace storage, and GPU resources. | -| VM | Experimental microVM isolation. | Per-sandbox libkrun VM. | Gateway spawns `openshell-driver-vm` as a subprocess over a private, state-local Unix socket. The VM driver boots a cached bootstrap `rootfs.ext4`, prepares requested OCI images inside a bootstrap VM with `umoci`, attaches the prepared image disk read-only, and gives each sandbox a writable `overlay.ext4` for merged-root changes and runtime material. The driver persists each accepted launch request beside the overlay and restarts those VMs on driver startup without recreating the overlay. | +| VM | Experimental microVM isolation. | Per-sandbox libkrun VM. | Gateway spawns `openshell-driver-vm` as a subprocess over a private, state-local Unix socket. The VM driver boots a cached bootstrap `rootfs.ext4`, prepares requested OCI images inside a bootstrap VM with `umoci`, attaches the prepared image disk read-only, and gives each sandbox a writable `overlay.ext4` for merged-root changes and runtime material. The driver persists each accepted launch request beside the overlay; the gateway explicitly calls the driver's resume RPC on startup so it can supply a fresh sandbox token before the VM is relaunched. | Per-sandbox CPU and memory values currently enter the driver layer through template resource limits. Docker and Podman apply them as runtime limits. @@ -64,6 +64,28 @@ Driver-controlled environment variables must override sandbox image or template values for sandbox ID, sandbox name, gateway endpoint, relay socket path, TLS paths, and command metadata. +## Sandbox Tokens + +When gateway-minted sandbox JWTs are enabled, each runtime declares its token +contract with `OPENSHELL_SANDBOX_AUTH_MODE`: + +- Docker and Podman use `gateway-managed-file`. The gateway writes host token + files that are mounted read-only into the container, and the supervisor + re-reads `OPENSHELL_SANDBOX_TOKEN_FILE` on outbound gateway calls. +- VM uses `gateway-managed-supervisor-push`. The gateway supplies a fresh token + through the driver's resume/write RPCs and sends live token updates over + `ConnectSupervisor` so the guest can rewrite + `/opt/openshell/auth/sandbox.jwt`. +- Kubernetes uses `kubernetes-service-account-exchange`. The supervisor reads + the projected ServiceAccount token from `OPENSHELL_K8S_SA_TOKEN_FILE` and + exchanges it for a gateway JWT with `IssueSandboxToken`. + +During startup, local-driver resume hooks receive a freshly minted token before +starting or re-adopting each persisted sandbox. The gateway also runs a refresh +sweep after startup resume and then rotates local-runtime tokens before expiry. +This lets a local sandbox recover after the gateway, container, or VM was +stopped long enough for the previous token to expire. + ## Images The gateway image and Helm chart are built from this repository. Sandbox images diff --git a/crates/openshell-core/Cargo.toml b/crates/openshell-core/Cargo.toml index b03fb1494..3961b4d57 100644 --- a/crates/openshell-core/Cargo.toml +++ b/crates/openshell-core/Cargo.toml @@ -20,6 +20,7 @@ serde = { workspace = true } serde_json = { workspace = true } url = { workspace = true } ipnet = "2" +tempfile = "3" [features] ## Include test-only settings (dummy_bool, dummy_int) in the registry. @@ -31,8 +32,5 @@ dev-settings = [] tonic-build = { workspace = true } protobuf-src = { workspace = true } -[dev-dependencies] -tempfile = "3" - [lints] workspace = true diff --git a/crates/openshell-core/src/paths.rs b/crates/openshell-core/src/paths.rs index 65000c6cf..cc1ffa69b 100644 --- a/crates/openshell-core/src/paths.rs +++ b/crates/openshell-core/src/paths.rs @@ -116,6 +116,38 @@ pub fn ensure_parent_dir_restricted(path: &Path) -> Result<()> { Ok(()) } +/// Atomically write a sensitive file with owner-only read/write permissions. +/// +/// The parent directory is created with [`create_dir_restricted`]. The content +/// is written to a sibling temporary file, synced, chmodded to `0o600` on Unix, +/// and then renamed into place. +pub fn write_file_owner_only_atomic(path: &Path, contents: &[u8]) -> Result<()> { + let parent = path + .parent() + .ok_or_else(|| miette::miette!("path has no parent: {}", path.display()))?; + create_dir_restricted(parent)?; + let mut temp = tempfile::Builder::new() + .prefix(".openshell-") + .tempfile_in(parent) + .into_diagnostic() + .wrap_err_with(|| format!("failed to create temp file in {}", parent.display()))?; + + std::io::Write::write_all(&mut temp, contents) + .into_diagnostic() + .wrap_err_with(|| format!("failed to write temp file for {}", path.display()))?; + temp.as_file() + .sync_all() + .into_diagnostic() + .wrap_err_with(|| format!("failed to sync temp file for {}", path.display()))?; + set_file_owner_only(temp.path())?; + temp.persist(path) + .map_err(|err| err.error) + .into_diagnostic() + .wrap_err_with(|| format!("failed to rename temp file into {}", path.display()))?; + set_file_owner_only(path)?; + Ok(()) +} + /// Check whether a file has permissions that are too open (group/other readable). /// /// Returns `true` if the file has group or other read/write/execute bits set. @@ -180,6 +212,22 @@ mod tests { assert_eq!(mode, 0o600, "expected 0600, got {mode:04o}"); } + #[test] + fn write_file_owner_only_atomic_replaces_contents() { + let tmp = tempfile::tempdir().unwrap(); + let file = tmp.path().join("nested").join("secret"); + write_file_owner_only_atomic(&file, b"first\n").unwrap(); + write_file_owner_only_atomic(&file, b"second\n").unwrap(); + + assert_eq!(std::fs::read_to_string(&file).unwrap(), "second\n"); + #[cfg(unix)] + { + use std::os::unix::fs::PermissionsExt; + let mode = std::fs::metadata(&file).unwrap().permissions().mode() & 0o777; + assert_eq!(mode, 0o600, "expected 0600, got {mode:04o}"); + } + } + #[cfg(unix)] #[test] fn is_file_permissions_too_open_detects_world_readable() { diff --git a/crates/openshell-core/src/sandbox_env.rs b/crates/openshell-core/src/sandbox_env.rs index b367e450c..0c85be515 100644 --- a/crates/openshell-core/src/sandbox_env.rs +++ b/crates/openshell-core/src/sandbox_env.rs @@ -35,21 +35,87 @@ pub const TLS_CERT: &str = "OPENSHELL_TLS_CERT"; /// Path to the private key for mTLS communication with the gateway. pub const TLS_KEY: &str = "OPENSHELL_TLS_KEY"; -/// Raw gateway-minted JWT identifying this sandbox. Mutually exclusive with -/// [`SANDBOX_TOKEN_FILE`] / [`K8S_SA_TOKEN_FILE`]; used only by test harnesses -/// that bypass the file-mount path. +/// Selects how the supervisor bootstraps sandbox authentication and who owns +/// token refresh. +pub const SANDBOX_AUTH_MODE: &str = "OPENSHELL_SANDBOX_AUTH_MODE"; + +/// Explicit sandbox authentication modes. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum SandboxAuthMode { + /// Use [`SANDBOX_TOKEN`] as a static gateway JWT. + /// + /// This is intended for direct test/debug harnesses. The supervisor does + /// not refresh the token. + StaticToken, + + /// Use [`SANDBOX_TOKEN_FILE`] as a gateway-managed token file. + /// + /// Docker and Podman use this mode. The gateway refreshes the host-side + /// file and the supervisor re-reads it on outbound calls. + GatewayManagedFile, + + /// Use [`SANDBOX_TOKEN_FILE`] as a supervisor-writable token file. + /// + /// The VM driver uses this mode. The gateway injects a fresh token into + /// persisted VM state on resume and pushes live token updates over the + /// supervisor control stream. + GatewayManagedSupervisorPush, + + /// Use [`K8S_SA_TOKEN_FILE`] to exchange Kubernetes workload identity for + /// a gateway JWT. + /// + /// The supervisor re-exchanges the projected `ServiceAccount` token when + /// the gateway JWT needs rotation. + KubernetesServiceAccountExchange, +} + +impl SandboxAuthMode { + #[must_use] + pub const fn as_str(self) -> &'static str { + match self { + Self::StaticToken => "static-token", + Self::GatewayManagedFile => "gateway-managed-file", + Self::GatewayManagedSupervisorPush => "gateway-managed-supervisor-push", + Self::KubernetesServiceAccountExchange => "kubernetes-service-account-exchange", + } + } + + #[must_use] + pub fn allowed_values() -> &'static str { + "static-token, gateway-managed-file, gateway-managed-supervisor-push, kubernetes-service-account-exchange" + } +} + +impl std::str::FromStr for SandboxAuthMode { + type Err = String; + + fn from_str(value: &str) -> Result { + match value { + "static-token" => Ok(Self::StaticToken), + "gateway-managed-file" => Ok(Self::GatewayManagedFile), + "gateway-managed-supervisor-push" => Ok(Self::GatewayManagedSupervisorPush), + "kubernetes-service-account-exchange" => Ok(Self::KubernetesServiceAccountExchange), + other => Err(format!( + "invalid sandbox auth mode '{other}' (expected one of: {})", + Self::allowed_values() + )), + } + } +} + +/// Raw gateway-minted JWT identifying this sandbox. Used only when +/// [`SANDBOX_AUTH_MODE`] is [`SandboxAuthMode::StaticToken`]. pub const SANDBOX_TOKEN: &str = "OPENSHELL_SANDBOX_TOKEN"; /// Path to the file holding a gateway-minted sandbox JWT. /// -/// Set by the Docker, Podman, and VM drivers, which write the token to a -/// bundle file at sandbox-create time. Read once at supervisor startup; -/// the token is held in process memory thereafter. +/// Set by Docker, Podman, and VM when [`SANDBOX_AUTH_MODE`] is +/// [`SandboxAuthMode::GatewayManagedFile`] or +/// [`SandboxAuthMode::GatewayManagedSupervisorPush`]. pub const SANDBOX_TOKEN_FILE: &str = "OPENSHELL_SANDBOX_TOKEN_FILE"; /// Path to the projected `ServiceAccount` JWT (Kubernetes driver). /// -/// Used to bootstrap a gateway-minted JWT via `IssueSandboxToken`. Kubelet -/// writes and rotates this file; the supervisor exchanges its contents -/// for a gateway JWT at startup and on refresh. +/// Used when [`SANDBOX_AUTH_MODE`] is +/// [`SandboxAuthMode::KubernetesServiceAccountExchange`]. pub const K8S_SA_TOKEN_FILE: &str = "OPENSHELL_K8S_SA_TOKEN_FILE"; diff --git a/crates/openshell-driver-docker/README.md b/crates/openshell-driver-docker/README.md index ea57f44e4..d2be0b079 100644 --- a/crates/openshell-driver-docker/README.md +++ b/crates/openshell-driver-docker/README.md @@ -80,6 +80,8 @@ overwrites security-critical keys: - `OPENSHELL_SANDBOX` - `OPENSHELL_SSH_SOCKET_PATH` - `OPENSHELL_SANDBOX_COMMAND` +- `OPENSHELL_SANDBOX_AUTH_MODE=gateway-managed-file` and + `OPENSHELL_SANDBOX_TOKEN_FILE` when gateway JWT auth is enabled - TLS path variables when HTTPS is enabled Do not allow sandbox images or templates to override these values. diff --git a/crates/openshell-driver-docker/src/lib.rs b/crates/openshell-driver-docker/src/lib.rs index 2ce652bee..5cc76a808 100644 --- a/crates/openshell-driver-docker/src/lib.rs +++ b/crates/openshell-driver-docker/src/lib.rs @@ -32,10 +32,11 @@ use openshell_core::proto::compute::v1::{ CreateSandboxRequest, CreateSandboxResponse, DeleteSandboxRequest, DeleteSandboxResponse, DriverCondition, DriverPlatformEvent, DriverSandbox, DriverSandboxStatus, DriverSandboxTemplate, GetCapabilitiesRequest, GetCapabilitiesResponse, GetSandboxRequest, - GetSandboxResponse, ListSandboxesRequest, ListSandboxesResponse, StopSandboxRequest, - StopSandboxResponse, ValidateSandboxCreateRequest, ValidateSandboxCreateResponse, - WatchSandboxesDeletedEvent, WatchSandboxesEvent, WatchSandboxesPlatformEvent, - WatchSandboxesRequest, WatchSandboxesSandboxEvent, compute_driver_server::ComputeDriver, + GetSandboxResponse, ListSandboxesRequest, ListSandboxesResponse, ResumeSandboxRequest, + ResumeSandboxResponse, StopSandboxRequest, StopSandboxResponse, ValidateSandboxCreateRequest, + ValidateSandboxCreateResponse, WatchSandboxesDeletedEvent, WatchSandboxesEvent, + WatchSandboxesPlatformEvent, WatchSandboxesRequest, WatchSandboxesSandboxEvent, + WriteSandboxTokenRequest, WriteSandboxTokenResponse, compute_driver_server::ComputeDriver, watch_sandboxes_event, }; use openshell_core::{Config, Error, Result as CoreResult}; @@ -62,6 +63,7 @@ const TLS_CA_MOUNT_PATH: &str = "/etc/openshell/tls/client/ca.crt"; const TLS_CERT_MOUNT_PATH: &str = "/etc/openshell/tls/client/tls.crt"; const TLS_KEY_MOUNT_PATH: &str = "/etc/openshell/tls/client/tls.key"; const SANDBOX_TOKEN_MOUNT_PATH: &str = "/etc/openshell/auth/sandbox.jwt"; +const SANDBOX_TOKEN_MOUNT_DIR: &str = "/etc/openshell/auth"; const SANDBOX_COMMAND: &str = "sleep infinity"; const SUPERVISOR_PATH: &str = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"; const HOST_OPENSHELL_INTERNAL: &str = "host.openshell.internal"; @@ -522,9 +524,8 @@ impl DockerComputeDriver { .map_err(|status| { DockerProvisioningFailure::new("ImagePullFailed", status.message()) })?; - let token_file_created = write_sandbox_token_file(sandbox, &self.config) - .await - .map_err(|status| { + let token_file_created = + write_sandbox_token_file(sandbox, &self.config).map_err(|status| { DockerProvisioningFailure::new("SandboxTokenWriteFailed", status.message()) })?; @@ -720,6 +721,7 @@ impl DockerComputeDriver { &self, sandbox_id: &str, sandbox_name: &str, + sandbox_token: Option<&str>, ) -> Result { let Some(container) = self .find_managed_container_summary(sandbox_id, sandbox_name) @@ -731,6 +733,9 @@ impl DockerComputeDriver { return Ok(false); }; let state = container.state.unwrap_or(ContainerSummaryStateEnum::EMPTY); + if let Some(token) = sandbox_token.filter(|token| !token.is_empty()) { + self.write_sandbox_token(sandbox_id, token)?; + } if !container_state_needs_resume(state) { return Ok(true); } @@ -745,6 +750,10 @@ impl DockerComputeDriver { } } + pub fn write_sandbox_token(&self, sandbox_id: &str, token: &str) -> Result<(), Status> { + write_sandbox_token_file_by_id(sandbox_id, &self.config, token) + } + pub async fn stop_managed_containers_on_shutdown(&self) -> Result { let containers = self.list_managed_container_summaries().await?; let targets = containers @@ -1212,6 +1221,37 @@ impl ComputeDriver for DockerComputeDriver { Ok(Response::new(CreateSandboxResponse {})) } + async fn resume_sandbox( + &self, + request: Request, + ) -> Result, Status> { + let request = request.into_inner(); + require_sandbox_identifier(&request.sandbox_id, &request.sandbox_name)?; + let resumed = Self::resume_sandbox( + self, + &request.sandbox_id, + &request.sandbox_name, + (!request.sandbox_token.is_empty()).then_some(request.sandbox_token.as_str()), + ) + .await?; + Ok(Response::new(ResumeSandboxResponse { resumed })) + } + + async fn write_sandbox_token( + &self, + request: Request, + ) -> Result, Status> { + let request = request.into_inner(); + if request.sandbox_id.is_empty() { + return Err(Status::invalid_argument("sandbox_id is required")); + } + if request.sandbox_token.is_empty() { + return Err(Status::invalid_argument("sandbox_token is required")); + } + Self::write_sandbox_token(self, &request.sandbox_id, &request.sandbox_token)?; + Ok(Response::new(WriteSandboxTokenResponse {})) + } + async fn stop_sandbox( &self, request: Request, @@ -1548,18 +1588,31 @@ fn build_binds( { binds.push(format!( "{}:{}:ro,z", - sandbox_token_host_path(sandbox, config)?.display(), - SANDBOX_TOKEN_MOUNT_PATH + sandbox_token_host_dir(sandbox, config)?.display(), + SANDBOX_TOKEN_MOUNT_DIR )); } Ok(binds) } -fn sandbox_token_host_path( +fn sandbox_token_host_dir( sandbox: &DriverSandbox, config: &DockerDriverRuntimeConfig, ) -> Result { - sandbox_token_host_path_by_id(&sandbox.id, config) + sandbox_token_host_dir_by_id(&sandbox.id, config) +} + +fn sandbox_token_host_dir_by_id( + sandbox_id: &str, + config: &DockerDriverRuntimeConfig, +) -> Result { + let path = sandbox_token_host_path_by_id(sandbox_id, config)?; + path.parent().map(PathBuf::from).ok_or_else(|| { + Status::internal(format!( + "resolve sandbox token directory failed: {} has no parent", + path.display() + )) + }) } fn sandbox_token_host_path_by_id( @@ -1578,7 +1631,7 @@ fn sandbox_token_host_path_by_id( }) } -async fn write_sandbox_token_file( +fn write_sandbox_token_file( sandbox: &DriverSandbox, config: &DockerDriverRuntimeConfig, ) -> Result { @@ -1588,17 +1641,17 @@ async fn write_sandbox_token_file( if spec.sandbox_token.is_empty() { return Ok(false); } - let path = sandbox_token_host_path(sandbox, config)?; - if let Some(parent) = path.parent() { - openshell_core::paths::create_dir_restricted(parent).map_err(|err| { - Status::internal(format!( - "create sandbox token directory {} failed: {err}", - parent.display() - )) - })?; - } - tokio::fs::write(&path, format!("{}\n", spec.sandbox_token)) - .await + write_sandbox_token_file_by_id(&sandbox.id, config, &spec.sandbox_token)?; + Ok(true) +} + +fn write_sandbox_token_file_by_id( + sandbox_id: &str, + config: &DockerDriverRuntimeConfig, + token: &str, +) -> Result<(), Status> { + let path = sandbox_token_host_path_by_id(sandbox_id, config)?; + openshell_core::paths::write_file_owner_only_atomic(&path, format!("{token}\n").as_bytes()) .map_err(|err| { Status::internal(format!( "write sandbox token file {} failed: {err}", @@ -1611,7 +1664,7 @@ async fn write_sandbox_token_file( path.display() )) })?; - Ok(true) + Ok(()) } fn cleanup_sandbox_token_file(sandbox: &DriverSandbox, config: &DockerDriverRuntimeConfig) { @@ -1707,6 +1760,7 @@ fn build_environment(sandbox: &DriverSandbox, config: &DockerDriverRuntimeConfig environment.remove(openshell_core::sandbox_env::SANDBOX_TOKEN); environment.remove(openshell_core::sandbox_env::SANDBOX_TOKEN_FILE); + environment.remove(openshell_core::sandbox_env::SANDBOX_AUTH_MODE); // Gateway-minted sandbox JWT. Keep the raw bearer out of container // metadata; the supervisor reads it from this driver-owned bind mount. @@ -1717,6 +1771,12 @@ fn build_environment(sandbox: &DriverSandbox, config: &DockerDriverRuntimeConfig openshell_core::sandbox_env::SANDBOX_TOKEN_FILE.to_string(), SANDBOX_TOKEN_MOUNT_PATH.to_string(), ); + environment.insert( + openshell_core::sandbox_env::SANDBOX_AUTH_MODE.to_string(), + openshell_core::sandbox_env::SandboxAuthMode::GatewayManagedFile + .as_str() + .to_string(), + ); } let mut pairs = environment.into_iter().collect::>(); diff --git a/crates/openshell-driver-docker/src/tests.rs b/crates/openshell-driver-docker/src/tests.rs index cbc5130a4..722e90e2f 100644 --- a/crates/openshell-driver-docker/src/tests.rs +++ b/crates/openshell-driver-docker/src/tests.rs @@ -505,6 +505,31 @@ fn build_environment_uses_token_file_without_raw_token_env() { "{}={SANDBOX_TOKEN_MOUNT_PATH}", openshell_core::sandbox_env::SANDBOX_TOKEN_FILE ))); + assert!(env.contains(&format!( + "{}={}", + openshell_core::sandbox_env::SANDBOX_AUTH_MODE, + openshell_core::sandbox_env::SandboxAuthMode::GatewayManagedFile.as_str() + ))); +} + +#[test] +fn build_binds_mounts_token_auth_directory_read_only() { + let mut sandbox = test_sandbox(); + sandbox.spec.as_mut().unwrap().sandbox_token = "secret.jwt.value".to_string(); + let config = runtime_config(); + let token_dir = sandbox_token_host_dir(&sandbox, &config).unwrap(); + + let binds = build_binds(&sandbox, &config).unwrap(); + + assert!(binds.contains(&format!( + "{}:{SANDBOX_TOKEN_MOUNT_DIR}:ro,z", + token_dir.display() + ))); + assert!( + binds + .iter() + .all(|bind| !bind.ends_with(&format!(":{SANDBOX_TOKEN_MOUNT_PATH}:ro,z"))) + ); } #[test] diff --git a/crates/openshell-driver-kubernetes/README.md b/crates/openshell-driver-kubernetes/README.md index 1d45a1d83..eb93621e2 100644 --- a/crates/openshell-driver-kubernetes/README.md +++ b/crates/openshell-driver-kubernetes/README.md @@ -41,8 +41,10 @@ values must override image-provided environment variables. Sandbox pods run as `service_account_name` and keep `automountServiceAccountToken: false`. The only Kubernetes token exposed to the supervisor is an explicit, audience-bound projected token mounted at -`/var/run/secrets/openshell/token` for the one-shot `IssueSandboxToken` -bootstrap exchange. +`/var/run/secrets/openshell/token`. The driver sets +`OPENSHELL_SANDBOX_AUTH_MODE=kubernetes-service-account-exchange`, so the +supervisor exchanges that projected token for gateway JWTs with +`IssueSandboxToken`. The gateway uses the supervisor relay for connect, exec, and file sync. Sandbox pods do not need direct external ingress for SSH. diff --git a/crates/openshell-driver-kubernetes/src/config.rs b/crates/openshell-driver-kubernetes/src/config.rs index d71133465..d4937322f 100644 --- a/crates/openshell-driver-kubernetes/src/config.rs +++ b/crates/openshell-driver-kubernetes/src/config.rs @@ -75,22 +75,20 @@ pub struct KubernetesComputeConfig { pub enable_user_namespaces: bool, pub workspace_default_storage_size: String, /// Lifetime (seconds) of the projected `ServiceAccount` token kubelet - /// writes into each sandbox pod. Used only for the one-shot - /// `IssueSandboxToken` bootstrap exchange — the gateway-minted JWT - /// that follows has its own TTL set via `gateway_jwt.ttl_secs`. + /// writes into each sandbox pod. Used for `IssueSandboxToken` exchanges; + /// the gateway-minted JWT has its own TTL set via `gateway_jwt.ttl_secs`. /// - /// Kubelet enforces a minimum of 600 seconds; the supervisor uses - /// this token within a few seconds of pod start, so any value at - /// the floor is sufficient. Default 3600. + /// Kubelet enforces a minimum of 600 seconds and rotates the projected + /// token before expiry, so any value at the floor is sufficient. Default + /// 3600. pub sa_token_ttl_secs: i64, } /// Lower bound enforced by kubelet for projected SA tokens. pub const MIN_SA_TOKEN_TTL_SECS: i64 = 600; -/// Cap at 24h — operators who want longer-lived bootstrap tokens are -/// almost certainly misconfigured (the token is consumed seconds after -/// pod start). +/// Cap at 24h. Longer projected tokens are unnecessary because kubelet rotates +/// the token file and the supervisor re-exchanges it as needed. pub const MAX_SA_TOKEN_TTL_SECS: i64 = 86_400; impl Default for KubernetesComputeConfig { diff --git a/crates/openshell-driver-kubernetes/src/driver.rs b/crates/openshell-driver-kubernetes/src/driver.rs index 0a428f146..d6371f7e0 100644 --- a/crates/openshell-driver-kubernetes/src/driver.rs +++ b/crates/openshell-driver-kubernetes/src/driver.rs @@ -1527,6 +1527,11 @@ fn apply_required_env( // Projected ServiceAccount token written by kubelet (see the volume // definition in `sandbox_template_to_k8s`). The supervisor reads this // and exchanges it for a gateway-minted JWT via `IssueSandboxToken`. + upsert_env( + env, + openshell_core::sandbox_env::SANDBOX_AUTH_MODE, + openshell_core::sandbox_env::SandboxAuthMode::KubernetesServiceAccountExchange.as_str(), + ); upsert_env( env, openshell_core::sandbox_env::K8S_SA_TOKEN_FILE, diff --git a/crates/openshell-driver-kubernetes/src/grpc.rs b/crates/openshell-driver-kubernetes/src/grpc.rs index 03f8bed7f..71b224f45 100644 --- a/crates/openshell-driver-kubernetes/src/grpc.rs +++ b/crates/openshell-driver-kubernetes/src/grpc.rs @@ -7,9 +7,10 @@ use futures::{Stream, StreamExt}; use openshell_core::proto::compute::v1::{ CreateSandboxRequest, CreateSandboxResponse, DeleteSandboxRequest, DeleteSandboxResponse, GetCapabilitiesRequest, GetCapabilitiesResponse, GetSandboxRequest, GetSandboxResponse, - ListSandboxesRequest, ListSandboxesResponse, StopSandboxRequest, StopSandboxResponse, - ValidateSandboxCreateRequest, ValidateSandboxCreateResponse, WatchSandboxesEvent, - WatchSandboxesRequest, compute_driver_server::ComputeDriver, + ListSandboxesRequest, ListSandboxesResponse, ResumeSandboxRequest, ResumeSandboxResponse, + StopSandboxRequest, StopSandboxResponse, ValidateSandboxCreateRequest, + ValidateSandboxCreateResponse, WatchSandboxesEvent, WatchSandboxesRequest, + WriteSandboxTokenRequest, WriteSandboxTokenResponse, compute_driver_server::ComputeDriver, }; use std::pin::Pin; use tonic::{Request, Response, Status}; @@ -107,6 +108,24 @@ impl ComputeDriver for ComputeDriverService { Ok(Response::new(CreateSandboxResponse {})) } + async fn resume_sandbox( + &self, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented( + "resume sandbox is not implemented by the kubernetes compute driver", + )) + } + + async fn write_sandbox_token( + &self, + _request: Request, + ) -> Result, Status> { + Err(Status::unimplemented( + "gateway-managed sandbox token files are not implemented by the kubernetes compute driver", + )) + } + async fn stop_sandbox( &self, _request: Request, diff --git a/crates/openshell-driver-kubernetes/src/main.rs b/crates/openshell-driver-kubernetes/src/main.rs index 703659af3..434e8a109 100644 --- a/crates/openshell-driver-kubernetes/src/main.rs +++ b/crates/openshell-driver-kubernetes/src/main.rs @@ -77,9 +77,9 @@ struct Args { enable_user_namespaces: bool, /// Lifetime (seconds) of the projected `ServiceAccount` token - /// kubelet writes into each sandbox pod for the `IssueSandboxToken` - /// bootstrap exchange. Kubelet enforces a minimum of 600s; the - /// gateway clamps values outside `[600, 86400]`. Default 3600. + /// kubelet writes into each sandbox pod for `IssueSandboxToken` + /// exchanges. Kubelet enforces a minimum of 600s; the gateway clamps + /// values outside `[600, 86400]`. Default 3600. #[arg(long, env = "OPENSHELL_K8S_SA_TOKEN_TTL_SECS", default_value_t = 3600)] sa_token_ttl_secs: i64, } diff --git a/crates/openshell-driver-podman/Cargo.toml b/crates/openshell-driver-podman/Cargo.toml index 6f2963d92..c71cc2b29 100644 --- a/crates/openshell-driver-podman/Cargo.toml +++ b/crates/openshell-driver-podman/Cargo.toml @@ -35,7 +35,7 @@ thiserror = { workspace = true } miette = { workspace = true } [dev-dependencies] -temp-env = "0.3" +temp-env = { version = "0.3", features = ["async_closure"] } [lints] workspace = true diff --git a/crates/openshell-driver-podman/README.md b/crates/openshell-driver-podman/README.md index defb27b20..99ac73906 100644 --- a/crates/openshell-driver-podman/README.md +++ b/crates/openshell-driver-podman/README.md @@ -205,6 +205,7 @@ is injected as an environment variable. |---|---|---|---| | mTLS client cert/key | Bind-mounted file paths (`OPENSHELL_TLS_*` env vars point at them) | Yes (paths only) | Yes (paths only) | | Sandbox identity | Plaintext env var | Yes | Yes | +| Sandbox JWT | Read-only bind-mounted token directory | No | Yes | | gRPC endpoint | Plaintext env var, override-protected | Yes | Yes | | Supervisor relay socket path | Plaintext env var, override-protected | Yes | Yes | @@ -218,6 +219,8 @@ via sandbox templates: - `OPENSHELL_SSH_SOCKET_PATH` - `OPENSHELL_CONTAINER_IMAGE` - `OPENSHELL_SANDBOX_COMMAND` +- `OPENSHELL_SANDBOX_AUTH_MODE=gateway-managed-file` and + `OPENSHELL_SANDBOX_TOKEN_FILE` when gateway JWT auth is enabled ## Sandbox Lifecycle diff --git a/crates/openshell-driver-podman/src/container.rs b/crates/openshell-driver-podman/src/container.rs index c6b969e40..ccbd76482 100644 --- a/crates/openshell-driver-podman/src/container.rs +++ b/crates/openshell-driver-podman/src/container.rs @@ -51,6 +51,7 @@ const TLS_CA_MOUNT_PATH: &str = "/etc/openshell/tls/client/ca.crt"; const TLS_CERT_MOUNT_PATH: &str = "/etc/openshell/tls/client/tls.crt"; const TLS_KEY_MOUNT_PATH: &str = "/etc/openshell/tls/client/tls.key"; const SANDBOX_TOKEN_MOUNT_PATH: &str = "/etc/openshell/auth/sandbox.jwt"; +const SANDBOX_TOKEN_MOUNT_DIR: &str = "/etc/openshell/auth"; /// Build a Podman container name from the sandbox name. #[must_use] @@ -304,6 +305,7 @@ fn build_env( env.remove(openshell_core::sandbox_env::SANDBOX_TOKEN); env.remove(openshell_core::sandbox_env::SANDBOX_TOKEN_FILE); + env.remove(openshell_core::sandbox_env::SANDBOX_AUTH_MODE); // 4. Gateway-minted sandbox JWT. Keep the raw bearer out of container // metadata; the supervisor reads it from a driver-owned bind mount. @@ -314,6 +316,12 @@ fn build_env( openshell_core::sandbox_env::SANDBOX_TOKEN_FILE.into(), SANDBOX_TOKEN_MOUNT_PATH.into(), ); + env.insert( + openshell_core::sandbox_env::SANDBOX_AUTH_MODE.into(), + openshell_core::sandbox_env::SandboxAuthMode::GatewayManagedFile + .as_str() + .into(), + ); } env @@ -395,7 +403,7 @@ pub fn build_container_spec(sandbox: &DriverSandbox, config: &PodmanComputeConfi pub fn build_container_spec_with_token( sandbox: &DriverSandbox, config: &PodmanComputeConfig, - token_host_path: Option<&std::path::Path>, + token_host_dir: Option<&std::path::Path>, ) -> Value { let image = resolve_image(sandbox, config); let name = container_name(&sandbox.name); @@ -595,7 +603,7 @@ pub fn build_container_spec_with_token( options: ro, }); } - if let Some(path) = token_host_path { + if let Some(path) = token_host_dir { let mut ro = vec!["ro".into(), "rbind".into()]; if is_selinux_enabled() { ro.push("z".into()); @@ -603,7 +611,7 @@ pub fn build_container_spec_with_token( m.push(Mount { kind: "bind".into(), source: path.display().to_string(), - destination: SANDBOX_TOKEN_MOUNT_PATH.into(), + destination: SANDBOX_TOKEN_MOUNT_DIR.into(), options: ro, }); } @@ -1183,9 +1191,9 @@ mod tests { ..Default::default() }); let config = test_config(); - let token_path = std::path::Path::new("/host/token.jwt"); + let token_dir = std::path::Path::new("/host/token-dir"); - let spec = build_container_spec_with_token(&sandbox, &config, Some(token_path)); + let spec = build_container_spec_with_token(&sandbox, &config, Some(token_dir)); let env_map = spec["env"].as_object().expect("env should be an object"); assert_eq!( @@ -1200,13 +1208,22 @@ mod tests { .and_then(|v| v.as_str()), Some("/etc/openshell/auth/sandbox.jwt") ); + assert_eq!( + env_map + .get(openshell_core::sandbox_env::SANDBOX_AUTH_MODE) + .and_then(|v| v.as_str()), + Some(openshell_core::sandbox_env::SandboxAuthMode::GatewayManagedFile.as_str()) + ); let mounts = spec["mounts"] .as_array() .expect("mounts should be an array"); assert!(mounts.iter().any(|m| { m["type"].as_str() == Some("bind") - && m["source"].as_str() == Some("/host/token.jwt") - && m["destination"].as_str() == Some("/etc/openshell/auth/sandbox.jwt") + && m["source"].as_str() == Some("/host/token-dir") + && m["destination"].as_str() == Some("/etc/openshell/auth") + && m["options"].as_array().is_some_and(|options| { + options.iter().any(|option| option.as_str() == Some("ro")) + }) })); } diff --git a/crates/openshell-driver-podman/src/driver.rs b/crates/openshell-driver-podman/src/driver.rs index eaf2218dc..8a57dfeb8 100644 --- a/crates/openshell-driver-podman/src/driver.rs +++ b/crates/openshell-driver-podman/src/driver.rs @@ -61,7 +61,17 @@ fn sandbox_token_host_path(sandbox_id: &str) -> Result Result { + let path = sandbox_token_host_path(sandbox_id)?; + path.parent().map(PathBuf::from).ok_or_else(|| { + ComputeDriverError::Message(format!( + "resolve sandbox token directory failed: {} has no parent", + path.display() + )) + }) +} + +fn write_sandbox_token_file( sandbox: &DriverSandbox, ) -> Result, ComputeDriverError> { let Some(spec) = sandbox.spec.as_ref() else { @@ -70,17 +80,13 @@ async fn write_sandbox_token_file( if spec.sandbox_token.is_empty() { return Ok(None); } - let path = sandbox_token_host_path(&sandbox.id)?; - if let Some(parent) = path.parent() { - openshell_core::paths::create_dir_restricted(parent).map_err(|err| { - ComputeDriverError::Message(format!( - "create sandbox token directory {} failed: {err}", - parent.display() - )) - })?; - } - tokio::fs::write(&path, format!("{}\n", spec.sandbox_token)) - .await + write_sandbox_token_file_by_id(&sandbox.id, &spec.sandbox_token)?; + sandbox_token_host_dir(&sandbox.id).map(Some) +} + +fn write_sandbox_token_file_by_id(sandbox_id: &str, token: &str) -> Result<(), ComputeDriverError> { + let path = sandbox_token_host_path(sandbox_id)?; + openshell_core::paths::write_file_owner_only_atomic(&path, format!("{token}\n").as_bytes()) .map_err(|err| { ComputeDriverError::Message(format!( "write sandbox token file {} failed: {err}", @@ -93,7 +99,7 @@ async fn write_sandbox_token_file( path.display() )) })?; - Ok(Some(path)) + Ok(()) } fn cleanup_sandbox_token_file(sandbox_id: &str) { @@ -355,7 +361,7 @@ impl PodmanComputeDriver { if let Err(e) = self.client.create_volume(&vol_name).await { return Err(ComputeDriverError::from(e)); } - let token_host_path = match write_sandbox_token_file(sandbox).await { + let token_host_dir = match write_sandbox_token_file(sandbox) { Ok(path) => path, Err(e) => { let _ = self.client.remove_volume(&vol_name).await; @@ -367,7 +373,7 @@ impl PodmanComputeDriver { let spec = container::build_container_spec_with_token( sandbox, &self.config, - token_host_path.as_deref(), + token_host_dir.as_deref(), ); match self.client.create_container(&spec).await { Ok(_) => {} @@ -420,6 +426,58 @@ impl PodmanComputeDriver { .map_err(ComputeDriverError::from) } + /// Start a managed sandbox container that should still be running. + pub async fn resume_sandbox( + &self, + sandbox_id: &str, + sandbox_name: &str, + sandbox_token: Option<&str>, + ) -> Result { + let name = validated_container_name(sandbox_name)?; + let inspect = match self.client.inspect_container(&name).await { + Ok(inspect) => inspect, + Err(PodmanApiError::NotFound(_)) => return Ok(false), + Err(err) => return Err(ComputeDriverError::from(err)), + }; + + match inspect.config.labels.get(LABEL_SANDBOX_ID) { + Some(label_id) if label_id == sandbox_id => {} + Some(label_id) => { + warn!( + sandbox_id = %sandbox_id, + sandbox_name = %sandbox_name, + container = %name, + label_sandbox_id = %label_id, + "Podman container label sandbox ID did not match startup resume request" + ); + return Ok(false); + } + None => return Ok(false), + } + + if let Some(token) = sandbox_token.filter(|token| !token.is_empty()) { + self.write_sandbox_token(sandbox_id, token)?; + } + + if inspect.state.running { + return Ok(true); + } + + self.client + .start_container(&name) + .await + .map_err(ComputeDriverError::from)?; + Ok(true) + } + + pub fn write_sandbox_token( + &self, + sandbox_id: &str, + token: &str, + ) -> Result<(), ComputeDriverError> { + write_sandbox_token_file_by_id(sandbox_id, token) + } + /// Delete a sandbox container and its workspace volume. pub async fn delete_sandbox( &self, @@ -885,4 +943,64 @@ mod tests { ); let _ = std::fs::remove_file(socket_path); } + + #[tokio::test] + async fn resume_sandbox_rewrites_token_for_running_container() { + let sandbox_id = "sandbox-resume-running"; + let sandbox_name = "demo"; + let container_name = container::container_name(sandbox_name); + let inspect_body = serde_json::json!({ + "Id": "container-id", + "Name": format!("/{container_name}"), + "State": { + "Status": "running", + "Running": true + }, + "Config": { + "Labels": { + LABEL_SANDBOX_ID: sandbox_id + } + } + }) + .to_string(); + let (socket_path, request_log, handle) = spawn_podman_stub( + "resume-running-token", + vec![StubResponse::new(StatusCode::OK, inspect_body)], + ); + let driver = test_driver(socket_path.clone()); + let state_dir = socket_path.with_extension("state"); + let state_dir_string = state_dir.to_string_lossy().to_string(); + + let resumed = temp_env::async_with_vars( + [("XDG_STATE_HOME", Some(state_dir_string.as_str()))], + driver.resume_sandbox(sandbox_id, sandbox_name, Some("fresh-token")), + ) + .await + .expect("resume should succeed"); + + assert!(resumed, "running container should report resumed"); + handle.await.expect("stub task should finish"); + let requests = request_log + .lock() + .expect("request log lock should not be poisoned") + .clone(); + assert_eq!( + requests, + vec![format!( + "GET {}", + api_path(&format!("/libpod/containers/{container_name}/json")) + )] + ); + let token_path = state_dir + .join("openshell") + .join("podman-sandbox-tokens") + .join(sandbox_id) + .join("sandbox.jwt"); + assert_eq!( + std::fs::read_to_string(&token_path).expect("token file should exist"), + "fresh-token\n" + ); + let _ = std::fs::remove_dir_all(state_dir); + let _ = std::fs::remove_file(socket_path); + } } diff --git a/crates/openshell-driver-podman/src/grpc.rs b/crates/openshell-driver-podman/src/grpc.rs index 0c6015776..28b8ec44e 100644 --- a/crates/openshell-driver-podman/src/grpc.rs +++ b/crates/openshell-driver-podman/src/grpc.rs @@ -7,9 +7,10 @@ use futures::{Stream, StreamExt}; use openshell_core::proto::compute::v1::{ CreateSandboxRequest, CreateSandboxResponse, DeleteSandboxRequest, DeleteSandboxResponse, GetCapabilitiesRequest, GetCapabilitiesResponse, GetSandboxRequest, GetSandboxResponse, - ListSandboxesRequest, ListSandboxesResponse, StopSandboxRequest, StopSandboxResponse, - ValidateSandboxCreateRequest, ValidateSandboxCreateResponse, WatchSandboxesEvent, - WatchSandboxesRequest, compute_driver_server::ComputeDriver, + ListSandboxesRequest, ListSandboxesResponse, ResumeSandboxRequest, ResumeSandboxResponse, + StopSandboxRequest, StopSandboxResponse, ValidateSandboxCreateRequest, + ValidateSandboxCreateResponse, WatchSandboxesEvent, WatchSandboxesRequest, + WriteSandboxTokenRequest, WriteSandboxTokenResponse, compute_driver_server::ComputeDriver, }; use std::pin::Pin; use tonic::{Request, Response, Status}; @@ -104,6 +105,46 @@ impl ComputeDriver for ComputeDriverService { Ok(Response::new(CreateSandboxResponse {})) } + async fn resume_sandbox( + &self, + request: Request, + ) -> Result, Status> { + let request = request.into_inner(); + if request.sandbox_id.is_empty() { + return Err(Status::invalid_argument("sandbox_id is required")); + } + if request.sandbox_name.is_empty() { + return Err(Status::invalid_argument("sandbox_name is required")); + } + let resumed = self + .driver + .resume_sandbox( + &request.sandbox_id, + &request.sandbox_name, + (!request.sandbox_token.is_empty()).then_some(request.sandbox_token.as_str()), + ) + .await + .map_err(Status::from)?; + Ok(Response::new(ResumeSandboxResponse { resumed })) + } + + async fn write_sandbox_token( + &self, + request: Request, + ) -> Result, Status> { + let request = request.into_inner(); + if request.sandbox_id.is_empty() { + return Err(Status::invalid_argument("sandbox_id is required")); + } + if request.sandbox_token.is_empty() { + return Err(Status::invalid_argument("sandbox_token is required")); + } + self.driver + .write_sandbox_token(&request.sandbox_id, &request.sandbox_token) + .map_err(Status::from)?; + Ok(Response::new(WriteSandboxTokenResponse {})) + } + async fn stop_sandbox( &self, request: Request, diff --git a/crates/openshell-driver-vm/README.md b/crates/openshell-driver-vm/README.md index 724bde06c..91469b57a 100644 --- a/crates/openshell-driver-vm/README.md +++ b/crates/openshell-driver-vm/README.md @@ -194,9 +194,16 @@ during the first prepare. The driver also writes the accepted `DriverSandbox` launch request to `/sandboxes//sandbox.pb`. If the gateway restarts, it starts a -new VM driver process; that process scans the sandbox state directories, -restarts each persisted VM launcher, and preserves any existing `overlay.ext4` -instead of cloning a fresh overlay template. If a restart happened before the +new VM driver process and then calls the driver's `ResumeSandbox` RPC for each +gateway-persisted sandbox that should still be running. The resume request +includes a freshly minted sandbox JWT, so the driver updates `sandbox.pb` before +restarting the VM launcher and preserves any existing `overlay.ext4` instead of +cloning a fresh overlay template. While the VM is running, the gateway rotates +tokens through the driver and sends live token updates over `ConnectSupervisor`. +The VM supervisor runs with +`OPENSHELL_SANDBOX_AUTH_MODE=gateway-managed-supervisor-push`, so it treats +`/opt/openshell/auth/sandbox.jwt` as a gateway-owned token file and rewrites it +only when the gateway pushes a fresh token. If a restart happened before the overlay was created, the driver creates it during the resume attempt. ## Logs and debugging diff --git a/crates/openshell-driver-vm/src/driver.rs b/crates/openshell-driver-vm/src/driver.rs index 405bf226d..719bed2a2 100644 --- a/crates/openshell-driver-vm/src/driver.rs +++ b/crates/openshell-driver-vm/src/driver.rs @@ -31,11 +31,13 @@ use openshell_core::progress::{ use openshell_core::proto::compute::v1::{ CreateSandboxRequest, CreateSandboxResponse, DeleteSandboxRequest, DeleteSandboxResponse, DriverCondition as SandboxCondition, DriverPlatformEvent as PlatformEvent, - DriverSandbox as Sandbox, DriverSandboxStatus as SandboxStatus, GetCapabilitiesRequest, - GetCapabilitiesResponse, GetSandboxRequest, GetSandboxResponse, ListSandboxesRequest, - ListSandboxesResponse, StopSandboxRequest, StopSandboxResponse, ValidateSandboxCreateRequest, - ValidateSandboxCreateResponse, WatchSandboxesDeletedEvent, WatchSandboxesEvent, - WatchSandboxesPlatformEvent, WatchSandboxesRequest, WatchSandboxesSandboxEvent, + DriverSandbox as Sandbox, DriverSandboxSpec as SandboxSpec, + DriverSandboxStatus as SandboxStatus, GetCapabilitiesRequest, GetCapabilitiesResponse, + GetSandboxRequest, GetSandboxResponse, ListSandboxesRequest, ListSandboxesResponse, + ResumeSandboxRequest, ResumeSandboxResponse, StopSandboxRequest, StopSandboxResponse, + ValidateSandboxCreateRequest, ValidateSandboxCreateResponse, WatchSandboxesDeletedEvent, + WatchSandboxesEvent, WatchSandboxesPlatformEvent, WatchSandboxesRequest, + WatchSandboxesSandboxEvent, WriteSandboxTokenRequest, WriteSandboxTokenResponse, compute_driver_server::ComputeDriver, watch_sandboxes_event, }; use openshell_vfio::SysfsRoot; @@ -367,7 +369,6 @@ impl VmDriver { gpu_inventory, subnet_allocator, }; - driver.restore_persisted_sandboxes().await; Ok(driver) } @@ -509,6 +510,121 @@ impl VmDriver { Ok(CreateSandboxResponse {}) } + #[allow(clippy::result_large_err)] + pub async fn resume_sandbox( + &self, + sandbox_id: &str, + sandbox_name: &str, + sandbox_token: Option<&str>, + ) -> Result { + validate_sandbox_id(sandbox_id)?; + let state_dir = { + let registry = self.registry.lock().await; + registry.get(sandbox_id).map_or_else( + || sandboxes_root_dir(&self.config.state_dir).join(sandbox_id), + |record| record.state_dir.clone(), + ) + }; + + let metadata = match tokio::fs::metadata(&state_dir).await { + Ok(metadata) => metadata, + Err(err) if err.kind() == std::io::ErrorKind::NotFound => return Ok(false), + Err(err) => { + return Err(Status::internal(format!( + "stat persisted vm sandbox state dir {}: {err}", + state_dir.display() + ))); + } + }; + if !metadata.is_dir() { + return Err(Status::internal(format!( + "persisted vm sandbox state path is not a directory: {}", + state_dir.display() + ))); + } + + let mut sandbox = read_sandbox_request(&state_dir.join(SANDBOX_REQUEST_FILE)) + .await + .map_err(|err| { + Status::internal(format!( + "read persisted vm sandbox request {}: {err}", + state_dir.join(SANDBOX_REQUEST_FILE).display() + )) + })?; + if sandbox.id != sandbox_id { + return Err(Status::failed_precondition(format!( + "persisted sandbox id '{}' does not match requested id '{}'", + sandbox.id, sandbox_id + ))); + } + if !sandbox_name.is_empty() && sandbox.name != sandbox_name { + return Err(Status::failed_precondition(format!( + "persisted sandbox name '{}' does not match requested name '{}'", + sandbox.name, sandbox_name + ))); + } + validate_restored_sandbox_state(&self.config.state_dir, &state_dir, &sandbox)?; + + if let Some(token) = sandbox_token { + set_sandbox_token(&mut sandbox, token); + write_sandbox_request(&state_dir, &sandbox) + .await + .map_err(|err| { + Status::internal(format!( + "write refreshed vm sandbox request {}: {err}", + state_dir.join(SANDBOX_REQUEST_FILE).display() + )) + })?; + } + + if self.registry.lock().await.contains_key(sandbox_id) { + return Ok(true); + } + + self.restore_persisted_sandbox(sandbox, state_dir).await?; + Ok(true) + } + + #[allow(clippy::result_large_err)] + pub async fn write_sandbox_token( + &self, + sandbox_id: &str, + sandbox_token: &str, + ) -> Result<(), Status> { + validate_sandbox_id(sandbox_id)?; + if sandbox_token.is_empty() { + return Err(Status::invalid_argument("sandbox_token is required")); + } + let state_dir = { + let registry = self.registry.lock().await; + registry.get(sandbox_id).map_or_else( + || sandboxes_root_dir(&self.config.state_dir).join(sandbox_id), + |record| record.state_dir.clone(), + ) + }; + let request_path = state_dir.join(SANDBOX_REQUEST_FILE); + let mut sandbox = read_sandbox_request(&request_path).await.map_err(|err| { + if err.kind() == std::io::ErrorKind::NotFound { + Status::not_found("persisted vm sandbox request not found") + } else { + Status::internal(format!( + "read persisted vm sandbox request {}: {err}", + request_path.display() + )) + } + })?; + validate_restored_sandbox_state(&self.config.state_dir, &state_dir, &sandbox)?; + set_sandbox_token(&mut sandbox, sandbox_token); + write_sandbox_request(&state_dir, &sandbox) + .await + .map_err(|err| { + Status::internal(format!( + "write refreshed vm sandbox request {}: {err}", + request_path.display() + )) + }) + } + async fn provision_sandbox( &self, sandbox: Sandbox, @@ -902,88 +1018,20 @@ impl VmDriver { snapshots } - async fn restore_persisted_sandboxes(&self) { - let state_root = sandboxes_root_dir(&self.config.state_dir); - let mut entries = match tokio::fs::read_dir(&state_root).await { - Ok(entries) => entries, - Err(err) if err.kind() == std::io::ErrorKind::NotFound => return, - Err(err) => { - warn!( - state_root = %state_root.display(), - error = %err, - "vm driver: failed to scan persisted sandboxes" - ); - return; - } - }; - - loop { - let entry = match entries.next_entry().await { - Ok(Some(entry)) => entry, - Ok(None) => break, - Err(err) => { - warn!( - state_root = %state_root.display(), - error = %err, - "vm driver: failed to continue scanning persisted sandboxes" - ); - break; - } - }; - let state_dir = entry.path(); - let is_dir = match entry.file_type().await { - Ok(file_type) => file_type.is_dir(), - Err(err) => { - warn!( - state_dir = %state_dir.display(), - error = %err, - "vm driver: failed to inspect persisted sandbox state dir" - ); - continue; - } - }; - if !is_dir { - continue; - } - - let request_path = state_dir.join(SANDBOX_REQUEST_FILE); - let sandbox = match read_sandbox_request(&request_path).await { - Ok(sandbox) => sandbox, - Err(err) if err.kind() == std::io::ErrorKind::NotFound => continue, - Err(err) => { - warn!( - state_dir = %state_dir.display(), - error = %err, - "vm driver: failed to read persisted sandbox request" - ); - continue; - } - }; - - if let Err(status) = - validate_restored_sandbox_state(&self.config.state_dir, &state_dir, &sandbox) - { - warn!( - sandbox_id = %sandbox.id, - state_dir = %state_dir.display(), - error = %status.message(), - "vm driver: ignoring invalid persisted sandbox state" - ); - continue; - } - - self.restore_persisted_sandbox(sandbox, state_dir).await; - } - } - - async fn restore_persisted_sandbox(&self, sandbox: Sandbox, state_dir: PathBuf) { + async fn restore_persisted_sandbox( + &self, + sandbox: Sandbox, + state_dir: PathBuf, + ) -> Result<(), Status> { let Some(image_ref) = self.resolved_sandbox_image(&sandbox) else { warn!( sandbox_id = %sandbox.id, sandbox_name = %sandbox.name, "vm driver: cannot restore persisted sandbox without image" ); - return; + return Err(Status::failed_precondition( + "vm sandbox cannot be restored without an image", + )); }; let tls_paths = match self.config.tls_paths() { Ok(paths) => paths, @@ -994,7 +1042,7 @@ impl VmDriver { error = %err, "vm driver: cannot restore persisted sandbox TLS configuration" ); - return; + return Err(Status::failed_precondition(err)); } }; @@ -1002,7 +1050,7 @@ impl VmDriver { { let mut registry = self.registry.lock().await; if registry.contains_key(&sandbox.id) { - return; + return Ok(()); } registry.insert( sandbox.id.clone(), @@ -1052,6 +1100,7 @@ impl VmDriver { } else { task.abort(); } + Ok(()) } fn release_gpu_and_subnet(&self, sandbox_id: &str) { @@ -2452,6 +2501,30 @@ impl ComputeDriver for VmDriver { Ok(Response::new(response)) } + async fn resume_sandbox( + &self, + request: Request, + ) -> Result, Status> { + let request = request.into_inner(); + let resumed = Self::resume_sandbox( + self, + &request.sandbox_id, + &request.sandbox_name, + (!request.sandbox_token.is_empty()).then_some(request.sandbox_token.as_str()), + ) + .await?; + Ok(Response::new(ResumeSandboxResponse { resumed })) + } + + async fn write_sandbox_token( + &self, + request: Request, + ) -> Result, Status> { + let request = request.into_inner(); + Self::write_sandbox_token(self, &request.sandbox_id, &request.sandbox_token).await?; + Ok(Response::new(WriteSandboxTokenResponse {})) + } + async fn get_sandbox( &self, request: Request, @@ -3508,6 +3581,7 @@ fn build_guest_environment( environment.extend(merged_environment(sandbox)); environment.remove(openshell_core::sandbox_env::SANDBOX_TOKEN); environment.remove(openshell_core::sandbox_env::SANDBOX_TOKEN_FILE); + environment.remove(openshell_core::sandbox_env::SANDBOX_AUTH_MODE); if sandbox .spec .as_ref() @@ -3517,6 +3591,12 @@ fn build_guest_environment( openshell_core::sandbox_env::SANDBOX_TOKEN_FILE.to_string(), GUEST_SANDBOX_TOKEN_PATH.to_string(), ); + environment.insert( + openshell_core::sandbox_env::SANDBOX_AUTH_MODE.to_string(), + openshell_core::sandbox_env::SandboxAuthMode::GatewayManagedSupervisorPush + .as_str() + .to_string(), + ); } let mut pairs = environment.into_iter().collect::>(); @@ -3808,6 +3888,11 @@ async fn read_sandbox_request(path: &Path) -> Result { }) } +fn set_sandbox_token(sandbox: &mut Sandbox, token: &str) { + let spec = sandbox.spec.get_or_insert_with(SandboxSpec::default); + spec.sandbox_token = token.to_string(); +} + async fn write_private_file(path: &Path, bytes: Vec) -> Result<(), std::io::Error> { tokio::fs::write(path, bytes).await?; restrict_owner_read_write(path).await @@ -4732,6 +4817,58 @@ mod tests { let _ = std::fs::remove_dir_all(base); } + #[tokio::test] + async fn write_sandbox_token_updates_persisted_request() { + let base = unique_temp_dir(); + let driver_state = base.join("driver-state"); + let state_dir = sandbox_state_dir(&driver_state, "sandbox-123").unwrap(); + std::fs::create_dir_all(&state_dir).unwrap(); + let sandbox = Sandbox { + id: "sandbox-123".to_string(), + name: "sandbox-123".to_string(), + spec: Some(SandboxSpec { + sandbox_token: "old-token".to_string(), + ..Default::default() + }), + ..Default::default() + }; + write_sandbox_request(&state_dir, &sandbox) + .await + .expect("write persisted sandbox request"); + let driver = VmDriver { + config: VmDriverConfig { + state_dir: driver_state, + ..Default::default() + }, + launcher_bin: PathBuf::from("openshell-driver-vm"), + registry: Arc::new(Mutex::new(HashMap::new())), + image_cache_lock: Arc::new(Mutex::new(())), + events: broadcast::channel(WATCH_BUFFER).0, + gpu_inventory: None, + subnet_allocator: Arc::new(std::sync::Mutex::new(SubnetAllocator::new( + Ipv4Addr::new(10, 0, 128, 0), + 17, + ))), + }; + + driver + .write_sandbox_token("sandbox-123", "fresh-token") + .await + .expect("write sandbox token"); + + let restored = read_sandbox_request(&state_dir.join(SANDBOX_REQUEST_FILE)) + .await + .expect("read persisted sandbox request"); + assert_eq!( + restored + .spec + .as_ref() + .map(|spec| spec.sandbox_token.as_str()), + Some("fresh-token") + ); + let _ = std::fs::remove_dir_all(base); + } + #[test] fn prepare_sandbox_overlay_preserves_existing_overlay_on_resume() { let base = unique_temp_dir(); @@ -5044,6 +5181,11 @@ mod tests { "{}={GUEST_SANDBOX_TOKEN_PATH}", openshell_core::sandbox_env::SANDBOX_TOKEN_FILE ))); + assert!(env.contains(&format!( + "{}={}", + openshell_core::sandbox_env::SANDBOX_AUTH_MODE, + openshell_core::sandbox_env::SandboxAuthMode::GatewayManagedSupervisorPush.as_str() + ))); } #[test] diff --git a/crates/openshell-sandbox/src/debug_rpc.rs b/crates/openshell-sandbox/src/debug_rpc.rs index af22b7450..5767c4928 100644 --- a/crates/openshell-sandbox/src/debug_rpc.rs +++ b/crates/openshell-sandbox/src/debug_rpc.rs @@ -11,7 +11,7 @@ //! //! Subcommands: //! - `get-sandbox-config --sandbox-id ` — call `GetSandboxConfig` -//! - `refresh` — call `RefreshSandboxToken` +//! - `refresh` — manually call the legacy `RefreshSandboxToken` RPC //! - `show-token` — print a token fingerprint and expiry, never the bearer //! - `show-principal` — pretty-print the decoded JWT claims //! (no signature verification — the supervisor already trusts the @@ -54,13 +54,12 @@ usage: openshell-sandbox debug-rpc [options] commands: get-sandbox-config --sandbox-id call GetSandboxConfig - refresh renew the gateway JWT + refresh manually call RefreshSandboxToken show-token print JWT fingerprint and expiry show-principal print decoded JWT claims -requires: OPENSHELL_ENDPOINT in env, plus one of OPENSHELL_SANDBOX_TOKEN, -OPENSHELL_SANDBOX_TOKEN_FILE, or OPENSHELL_K8S_SA_TOKEN_FILE so the -supervisor's normal token-acquisition path can resolve a JWT."; +requires: OPENSHELL_ENDPOINT and OPENSHELL_SANDBOX_AUTH_MODE in env, plus +the token variable required by that auth mode."; async fn open_client() -> Result> { let endpoint = std::env::var(openshell_core::sandbox_env::ENDPOINT) @@ -183,9 +182,9 @@ fn token_fingerprint(token: &str) -> String { format!("sha256:{}", &hex::encode(digest)[..16]) } -/// Read the token from the env/file/SA-bootstrap chain, but only the -/// "already a gateway JWT" paths — show-token / show-principal don't -/// want to actually exchange an SA token. +/// Read the token from the static or file-backed auth modes, but only the +/// "already a gateway JWT" paths — show-token / show-principal don't want to +/// actually exchange a Kubernetes `ServiceAccount` token. fn read_local_token() -> Result { if let Ok(t) = std::env::var(openshell_core::sandbox_env::SANDBOX_TOKEN) && !t.is_empty() @@ -203,7 +202,7 @@ fn read_local_token() -> Result { } Err(miette::miette!( "no in-process gateway JWT available — set OPENSHELL_SANDBOX_TOKEN or \ - OPENSHELL_SANDBOX_TOKEN_FILE. The K8s SA-bootstrap path is intentionally \ + OPENSHELL_SANDBOX_TOKEN_FILE. The K8s ServiceAccount exchange path is intentionally \ excluded from `show-token` / `show-principal` to avoid issuing a fresh \ token just for inspection." )) diff --git a/crates/openshell-sandbox/src/grpc_client.rs b/crates/openshell-sandbox/src/grpc_client.rs index 14a6808c1..e78c29a33 100644 --- a/crates/openshell-sandbox/src/grpc_client.rs +++ b/crates/openshell-sandbox/src/grpc_client.rs @@ -5,20 +5,20 @@ //! route bundles from `OpenShell` server. //! //! Every request carries a gateway-minted JWT in the `Authorization` header. -//! The token is resolved at startup from one of three sources: +//! The token is resolved at startup from the explicit +//! `OPENSHELL_SANDBOX_AUTH_MODE`: //! -//! 1. `OPENSHELL_SANDBOX_TOKEN` — raw JWT in the env (test harness path). -//! 2. `OPENSHELL_SANDBOX_TOKEN_FILE` — file containing the JWT (Docker / -//! Podman / VM drivers write this to a bundle file at sandbox-create -//! time). -//! 3. `OPENSHELL_K8S_SA_TOKEN_FILE` — projected `ServiceAccount` JWT; the -//! supervisor exchanges it for a gateway JWT via `IssueSandboxToken` -//! once at startup. -//! -//! The resolved gateway JWT is held in process memory thereafter and -//! injected on every outbound call by [`AuthInterceptor`]. +//! - `static-token` reads `OPENSHELL_SANDBOX_TOKEN` and never refreshes it. +//! - `gateway-managed-file` reads `OPENSHELL_SANDBOX_TOKEN_FILE`; Docker and +//! Podman refresh that host-side file from the gateway. +//! - `gateway-managed-supervisor-push` reads `OPENSHELL_SANDBOX_TOKEN_FILE`; +//! VM refreshes that file through gateway control-stream pushes. +//! - `kubernetes-service-account-exchange` reads +//! `OPENSHELL_K8S_SA_TOKEN_FILE` and exchanges it for a gateway JWT via +//! `IssueSandboxToken`. use std::collections::HashMap; +use std::path::{Path, PathBuf}; use std::sync::{Arc, OnceLock, RwLock}; use std::time::{Duration, SystemTime, UNIX_EPOCH}; @@ -26,11 +26,12 @@ use miette::{IntoDiagnostic, Result, WrapErr}; use openshell_core::proto::{ DenialSummary, GetDraftPolicyRequest, GetInferenceBundleRequest, GetInferenceBundleResponse, GetSandboxConfigRequest, GetSandboxProviderEnvironmentRequest, IssueSandboxTokenRequest, - PolicyChunk, PolicySource, PolicyStatus, RefreshSandboxTokenRequest, ReportPolicyStatusRequest, + PolicyChunk, PolicySource, PolicyStatus, ReportPolicyStatusRequest, SandboxPolicy as ProtoSandboxPolicy, SubmitPolicyAnalysisRequest, SubmitPolicyAnalysisResponse, UpdateConfigRequest, inference_client::InferenceClient, open_shell_client::OpenShellClient, }; use openshell_core::sandbox_env; +use openshell_core::sandbox_env::SandboxAuthMode; use tonic::Status; use tonic::metadata::AsciiMetadataValue; use tonic::service::interceptor::InterceptedService; @@ -41,16 +42,28 @@ use tracing::{debug, info, warn}; /// generated client type signatures stay readable. pub type AuthedChannel = InterceptedService; -/// Shared, refreshable Bearer header. All [`AuthInterceptor`] clones read -/// the same slot, so the renewal task can replace the token in place without +/// Shared Bearer header. All [`AuthInterceptor`] clones read the same slot, so +/// Kubernetes token re-exchange can replace the token in place without /// rebuilding the channel. type TokenSlot = Arc>; -#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[derive(Debug, Clone, PartialEq, Eq)] enum TokenSource { - Env, - File, - K8sServiceAccount, + StaticToken, + GatewayManagedFile { path: PathBuf }, + GatewayManagedSupervisorPush { path: PathBuf }, + K8sServiceAccount { path: PathBuf }, +} + +impl TokenSource { + fn token_file_path(&self) -> Option<&Path> { + match self { + Self::GatewayManagedFile { path } | Self::GatewayManagedSupervisorPush { path } => { + Some(path) + } + Self::StaticToken | Self::K8sServiceAccount { .. } => None, + } + } } #[derive(Debug)] @@ -60,7 +73,8 @@ struct AcquiredToken { } /// Process-wide token slot. Initialized by the first [`connect_channel`] -/// call and shared with every subsequent client and the renewal loop. +/// call and shared with every subsequent client and the Kubernetes exchange +/// loop, when that auth mode is active. static TOKEN_SLOT: OnceLock = OnceLock::new(); /// Source used to acquire the process-wide token slot. @@ -68,16 +82,21 @@ static TOKEN_SOURCE: OnceLock = OnceLock::new(); /// Serializes the first token acquisition. Several supervisor subsystems /// connect during startup; without this guard they can all observe an empty -/// [`TOKEN_SLOT`] and perform duplicate K8s bootstrap exchanges. +/// [`TOKEN_SLOT`] and perform duplicate Kubernetes `ServiceAccount` exchanges. static TOKEN_INIT_LOCK: tokio::sync::Mutex<()> = tokio::sync::Mutex::const_new(()); -/// One-shot guard so the renewal loop spawns at most once per process. +/// One-shot guard so the Kubernetes exchange loop spawns at most once per +/// process. static REFRESH_SPAWNED: OnceLock<()> = OnceLock::new(); -fn install_token_slot(token: &str) -> Result { - let bearer = AsciiMetadataValue::try_from(format!("Bearer {token}")) +fn bearer_value(token: &str) -> Result { + AsciiMetadataValue::try_from(format!("Bearer {token}")) .into_diagnostic() - .wrap_err("sandbox JWT contained characters not valid for a header value")?; + .wrap_err("sandbox JWT contained characters not valid for a header value") +} + +fn install_token_slot(token: &str) -> Result { + let bearer = bearer_value(token)?; if let Some(existing) = TOKEN_SLOT.get() { *existing.write().expect("token slot poisoned") = bearer; return Ok(existing.clone()); @@ -88,16 +107,46 @@ fn install_token_slot(token: &str) -> Result { } /// gRPC interceptor that injects `authorization: Bearer ` on every -/// outbound request. The token lives in a shared [`TokenSlot`] so the renewal -/// task can replace it without rebuilding clients. +/// outbound request. The token lives in a shared [`TokenSlot`] so Kubernetes +/// token re-exchange can replace it without rebuilding clients. #[derive(Clone)] pub struct AuthInterceptor { bearer: TokenSlot, + source: TokenSource, } impl AuthInterceptor { - fn new(bearer: TokenSlot) -> Self { - Self { bearer } + fn new(bearer: TokenSlot, source: TokenSource) -> Self { + Self { bearer, source } + } + + fn current_bearer(&self) -> AsciiMetadataValue { + if let Some(path) = self.source.token_file_path() { + match read_sandbox_token_file(path).and_then(|token| bearer_value(&token)) { + Ok(value) => { + if let Ok(mut guard) = self.bearer.write() + && *guard != value + { + *guard = value.clone(); + debug!( + path = %path.display(), + "loaded rotated sandbox token from file" + ); + } + return value; + } + Err(err) => warn!( + path = %path.display(), + error = %err, + "failed to reload sandbox token file; using cached token" + ), + } + } + + self.bearer + .read() + .expect("auth interceptor token slot poisoned") + .clone() } } @@ -106,12 +155,8 @@ impl tonic::service::Interceptor for AuthInterceptor { &mut self, mut req: tonic::Request<()>, ) -> std::result::Result, Status> { - let bearer = self - .bearer - .read() - .expect("auth interceptor token slot poisoned") - .clone(); - req.metadata_mut().insert("authorization", bearer); + req.metadata_mut() + .insert("authorization", self.current_bearer()); Ok(req) } } @@ -179,23 +224,24 @@ async fn build_plain_channel(endpoint: &str) -> Result { /// Build a Bearer-authenticated channel to the gateway. /// -/// First call per process resolves the sandbox JWT via the three-step -/// lookup (env → file → K8s SA bootstrap exchange) and installs it into -/// the process-wide [`TOKEN_SLOT`]. Subsequent calls reuse the cached -/// slot — the renewal loop keeps the value fresh, so re-running the -/// bootstrap is both unnecessary and (on the K8s SA path) expensive -/// (one apiserver round-trip per call). The renewal loop itself is -/// spawned once per process via [`REFRESH_SPAWNED`]. +/// First call per process resolves the sandbox JWT from +/// `OPENSHELL_SANDBOX_AUTH_MODE` and installs it into the process-wide +/// [`TOKEN_SLOT`]. Subsequent calls reuse the cached slot. For Kubernetes +/// service-account exchange mode, the refresh loop is spawned once per process +/// via [`REFRESH_SPAWNED`]. async fn connect_channel(endpoint: &str) -> Result { let channel = build_plain_channel(endpoint).await?; let (slot, source) = token_slot(endpoint, &channel).await?; let plain_channel = channel.clone(); - let intercepted = InterceptedService::new(channel, AuthInterceptor::new(slot.clone())); - if REFRESH_SPAWNED.set(()).is_ok() { - let refresh_channel = intercepted.clone(); + let intercepted = + InterceptedService::new(channel, AuthInterceptor::new(slot.clone(), source.clone())); + if let TokenSource::K8sServiceAccount { path } = &source + && REFRESH_SPAWNED.set(()).is_ok() + { let endpoint = endpoint.to_string(); + let path = path.clone(); tokio::spawn(async move { - refresh_token_loop(refresh_channel, slot, source, endpoint, plain_channel).await; + refresh_k8s_exchange_loop(slot, endpoint, plain_channel, path).await; }); } Ok(intercepted) @@ -203,20 +249,26 @@ async fn connect_channel(endpoint: &str) -> Result { async fn token_slot(endpoint: &str, plain_channel: &Channel) -> Result<(TokenSlot, TokenSource)> { if let Some(existing) = TOKEN_SLOT.get() { - let source = TOKEN_SOURCE.get().copied().unwrap_or(TokenSource::Env); + let source = TOKEN_SOURCE + .get() + .cloned() + .unwrap_or(TokenSource::StaticToken); return Ok((existing.clone(), source)); } let _guard = TOKEN_INIT_LOCK.lock().await; if let Some(existing) = TOKEN_SLOT.get() { - let source = TOKEN_SOURCE.get().copied().unwrap_or(TokenSource::Env); + let source = TOKEN_SOURCE + .get() + .cloned() + .unwrap_or(TokenSource::StaticToken); return Ok((existing.clone(), source)); } let acquired = acquire_sandbox_token(endpoint, plain_channel).await?; let slot = install_token_slot(&acquired.token)?; - let _ = TOKEN_SOURCE.set(acquired.source); + let _ = TOKEN_SOURCE.set(acquired.source.clone()); Ok((slot, acquired.source)) } @@ -227,58 +279,94 @@ async fn token_slot(endpoint: &str, plain_channel: &Channel) -> Result<(TokenSlo /// bootstrap path, which uses `plain_channel` to call `IssueSandboxToken` /// once before the steady-state Bearer-authenticated channel is built. async fn acquire_sandbox_token(endpoint: &str, plain_channel: &Channel) -> Result { - if let Ok(t) = std::env::var(sandbox_env::SANDBOX_TOKEN) - && !t.is_empty() - { - debug!(source = "env", "loaded sandbox token"); - return Ok(AcquiredToken { - token: t, - source: TokenSource::Env, - }); + let auth_mode = sandbox_auth_mode()?; + + match auth_mode { + SandboxAuthMode::StaticToken => { + let token = required_env(sandbox_env::SANDBOX_TOKEN, auth_mode)?; + debug!( + source = "env", + mode = auth_mode.as_str(), + "loaded sandbox token" + ); + Ok(AcquiredToken { + token, + source: TokenSource::StaticToken, + }) + } + SandboxAuthMode::GatewayManagedFile => { + let path = required_path_env(sandbox_env::SANDBOX_TOKEN_FILE, auth_mode)?; + let token = read_sandbox_token_file(&path)?; + debug!(source = "file", mode = auth_mode.as_str(), path = %path.display(), "loaded sandbox token"); + Ok(AcquiredToken { + token, + source: TokenSource::GatewayManagedFile { path }, + }) + } + SandboxAuthMode::GatewayManagedSupervisorPush => { + let path = required_path_env(sandbox_env::SANDBOX_TOKEN_FILE, auth_mode)?; + let token = read_sandbox_token_file(&path)?; + debug!(source = "file", mode = auth_mode.as_str(), path = %path.display(), "loaded sandbox token"); + Ok(AcquiredToken { + token, + source: TokenSource::GatewayManagedSupervisorPush { path }, + }) + } + SandboxAuthMode::KubernetesServiceAccountExchange => { + let path = required_path_env(sandbox_env::K8S_SA_TOKEN_FILE, auth_mode)?; + Ok(AcquiredToken { + token: acquire_k8s_sandbox_token(endpoint, plain_channel, &path).await?, + source: TokenSource::K8sServiceAccount { path }, + }) + } } +} - if let Ok(path) = std::env::var(sandbox_env::SANDBOX_TOKEN_FILE) - && !path.is_empty() - { - let contents = std::fs::read_to_string(&path) - .into_diagnostic() - .wrap_err_with(|| format!("failed to read sandbox token from {path}"))?; - debug!(source = "file", path = %path, "loaded sandbox token"); - return Ok(AcquiredToken { - token: contents.trim().to_string(), - source: TokenSource::File, - }); - } +fn sandbox_auth_mode() -> Result { + let value = std::env::var(sandbox_env::SANDBOX_AUTH_MODE) + .into_diagnostic() + .wrap_err_with(|| { + format!( + "{} is required (expected one of: {})", + sandbox_env::SANDBOX_AUTH_MODE, + SandboxAuthMode::allowed_values() + ) + })?; + value + .parse::() + .map_err(|err| miette::miette!("{err}")) +} - if let Ok(sa_path) = std::env::var(sandbox_env::K8S_SA_TOKEN_FILE) - && !sa_path.is_empty() - { - return Ok(AcquiredToken { - token: acquire_k8s_sandbox_token(endpoint, plain_channel, &sa_path).await?, - source: TokenSource::K8sServiceAccount, - }); - } +fn required_env(name: &str, auth_mode: SandboxAuthMode) -> Result { + std::env::var(name) + .ok() + .filter(|value| !value.is_empty()) + .ok_or_else(|| { + miette::miette!( + "{} is required when {}={}", + name, + sandbox_env::SANDBOX_AUTH_MODE, + auth_mode.as_str() + ) + }) +} - Err(miette::miette!( - "no sandbox token source available — set one of {}, {}, or {}", - sandbox_env::SANDBOX_TOKEN, - sandbox_env::SANDBOX_TOKEN_FILE, - sandbox_env::K8S_SA_TOKEN_FILE, - )) +fn required_path_env(name: &str, auth_mode: SandboxAuthMode) -> Result { + required_env(name, auth_mode).map(PathBuf::from) } async fn acquire_k8s_sandbox_token( endpoint: &str, plain_channel: &Channel, - sa_path: &str, + sa_path: &Path, ) -> Result { let sa_token = std::fs::read_to_string(sa_path) .into_diagnostic() - .wrap_err_with(|| format!("failed to read K8s SA token from {sa_path}"))? + .wrap_err_with(|| format!("failed to read K8s SA token from {}", sa_path.display()))? .trim() .to_string(); info!(endpoint = %endpoint, "exchanging K8s ServiceAccount token for sandbox JWT"); - // The bootstrap exchange uses a one-off interceptor pinned to the + // The ServiceAccount exchange uses a one-off interceptor pinned to the // SA token; the resulting gateway JWT becomes the value in the // shared `TOKEN_SLOT` once `connect_channel` returns. let bootstrap_slot: TokenSlot = Arc::new(RwLock::new( @@ -286,14 +374,14 @@ async fn acquire_k8s_sandbox_token( .into_diagnostic() .wrap_err("SA token contained characters not valid for a header value")?, )); - let interceptor = AuthInterceptor::new(bootstrap_slot); + let interceptor = AuthInterceptor::new(bootstrap_slot, TokenSource::StaticToken); let bootstrap = InterceptedService::new(plain_channel.clone(), interceptor); let mut client = OpenShellClient::new(bootstrap); let resp = client .issue_sandbox_token(IssueSandboxTokenRequest {}) .await .into_diagnostic() - .wrap_err("IssueSandboxToken bootstrap exchange failed")?; + .wrap_err("IssueSandboxToken service-account exchange failed")?; Ok(resp.into_inner().token) } @@ -303,81 +391,36 @@ pub async fn connect_channel_pub(endpoint: &str) -> Result { connect_channel(endpoint).await } -/// Background task that renews the sandbox JWT at ~80% of its remaining -/// lifetime. The new token replaces the value in [`TOKEN_SLOT`], so all -/// in-flight and future clients pick it up on their next request. The -/// loop never panics: every failure is logged and re-attempted after a -/// bounded backoff. -async fn refresh_token_loop( - channel: AuthedChannel, +/// Background task that re-exchanges the Kubernetes `ServiceAccount` JWT at +/// ~80% of the gateway JWT's remaining lifetime. The new gateway token +/// replaces the value in [`TOKEN_SLOT`], so all in-flight and future clients +/// pick it up on their next request. The loop never panics: every failure is +/// logged and retried after a bounded backoff. +async fn refresh_k8s_exchange_loop( slot: TokenSlot, - source: TokenSource, endpoint: String, plain_channel: Channel, + sa_path: PathBuf, ) { - let mut client = OpenShellClient::new(channel); loop { let sleep = compute_refresh_delay(&slot); tokio::time::sleep(sleep).await; - match client - .refresh_sandbox_token(RefreshSandboxTokenRequest {}) - .await - { - Ok(resp) => { - let new_token = resp.into_inner().token; - match AsciiMetadataValue::try_from(format!("Bearer {new_token}")) { - Ok(value) => { - if let Ok(mut guard) = slot.write() { - *guard = value; - info!("renewed gateway sandbox JWT in-place"); - } + match acquire_k8s_sandbox_token(&endpoint, &plain_channel, &sa_path).await { + Ok(new_token) => match AsciiMetadataValue::try_from(format!("Bearer {new_token}")) { + Ok(value) => { + if let Ok(mut guard) = slot.write() { + *guard = value; + info!("re-exchanged Kubernetes ServiceAccount token for sandbox JWT"); } - Err(e) => warn!(error = %e, "refreshed JWT contained invalid header bytes"), } - } - Err(status) => { - if status.code() == tonic::Code::Unauthenticated - && source == TokenSource::K8sServiceAccount - { - if let Some(sa_path) = std::env::var(sandbox_env::K8S_SA_TOKEN_FILE) - .ok() - .filter(|p| !p.is_empty()) - { - match acquire_k8s_sandbox_token(&endpoint, &plain_channel, &sa_path).await { - Ok(new_token) => { - match AsciiMetadataValue::try_from(format!("Bearer {new_token}")) { - Ok(value) => { - if let Ok(mut guard) = slot.write() { - *guard = value; - info!( - "rebootstrapped gateway sandbox JWT after refresh authentication failure" - ); - continue; - } - } - Err(e) => warn!( - error = %e, - "rebootstrapped JWT contained invalid header bytes" - ), - } - } - Err(e) => warn!( - error = %e, - "K8s ServiceAccount bootstrap retry failed after refresh authentication failure" - ), - } - } else { - warn!( - "RefreshSandboxToken returned Unauthenticated and K8s SA token file is unavailable" - ); - } - } else if status.code() == tonic::Code::Unauthenticated { - warn!( - source = ?source, - "RefreshSandboxToken returned Unauthenticated; static token sources cannot rebootstrap automatically" - ); - } - warn!(error = %status, "RefreshSandboxToken failed; will retry"); + Err(e) => warn!(error = %e, "refreshed JWT contained invalid header bytes"), + }, + Err(err) => { + warn!( + path = %sa_path.display(), + error = %err, + "Kubernetes ServiceAccount token exchange failed; will retry" + ); // Backoff so we don't spin against a sustained failure. tokio::time::sleep(Duration::from_secs(10)).await; } @@ -385,6 +428,30 @@ async fn refresh_token_loop( } } +pub fn supervisor_pushed_token_file_path() -> Option { + if sandbox_auth_mode().ok() != Some(SandboxAuthMode::GatewayManagedSupervisorPush) { + return None; + } + std::env::var(sandbox_env::SANDBOX_TOKEN_FILE) + .ok() + .filter(|path| !path.is_empty()) + .map(PathBuf::from) +} + +fn read_sandbox_token_file(path: &Path) -> Result { + let contents = std::fs::read_to_string(path) + .into_diagnostic() + .wrap_err_with(|| format!("failed to read sandbox token from {}", path.display()))?; + let token = contents.trim().to_string(); + if token.is_empty() { + return Err(miette::miette!( + "sandbox token file {} was empty", + path.display() + )); + } + Ok(token) +} + /// Compute the next refresh delay: 80 % of the time remaining until the /// current token's `exp`, plus up to 10 % jitter, with a small lower bound /// for already-expired tokens and capped at 12 h. If the token can't be parsed @@ -512,6 +579,40 @@ mod auth_tests { "expected refresh before 30s expiry, got {delay:?}", ); } + + #[test] + fn file_token_interceptor_reads_rotated_token_from_disk() { + let dir = tempfile::tempdir().unwrap(); + let path = dir.path().join("sandbox.jwt"); + std::fs::write(&path, "old-token\n").unwrap(); + let slot: TokenSlot = Arc::new(RwLock::new(bearer_value("old-token").unwrap())); + let mut interceptor = AuthInterceptor::new( + slot.clone(), + TokenSource::GatewayManagedFile { path: path.clone() }, + ); + + let req = + tonic::service::Interceptor::call(&mut interceptor, tonic::Request::new(())).unwrap(); + assert_eq!( + req.metadata() + .get("authorization") + .and_then(|value| value.to_str().ok()), + Some("Bearer old-token") + ); + + std::fs::write(&path, "new-token\n").unwrap(); + + let req = + tonic::service::Interceptor::call(&mut interceptor, tonic::Request::new(())).unwrap(); + assert_eq!( + req.metadata() + .get("authorization") + .and_then(|value| value.to_str().ok()), + Some("Bearer new-token") + ); + let guard = slot.read().unwrap(); + assert_eq!(guard.to_str().ok(), Some("Bearer new-token")); + } } /// Connect to the `OpenShell` server. diff --git a/crates/openshell-sandbox/src/process.rs b/crates/openshell-sandbox/src/process.rs index c770526be..c0f2be6f1 100644 --- a/crates/openshell-sandbox/src/process.rs +++ b/crates/openshell-sandbox/src/process.rs @@ -219,6 +219,7 @@ impl ProcessHandle { // (e.g. an SSH-spawned shell) could read /proc//environ // and recover the gateway-minted JWT. Issue #1354. cmd.env_remove(openshell_core::sandbox_env::SANDBOX_TOKEN) + .env_remove(openshell_core::sandbox_env::SANDBOX_AUTH_MODE) .env_remove(openshell_core::sandbox_env::SANDBOX_TOKEN_FILE) .env_remove(openshell_core::sandbox_env::K8S_SA_TOKEN_FILE); @@ -354,6 +355,7 @@ impl ProcessHandle { // (e.g. an SSH-spawned shell) could read /proc//environ // and recover the gateway-minted JWT. Issue #1354. cmd.env_remove(openshell_core::sandbox_env::SANDBOX_TOKEN) + .env_remove(openshell_core::sandbox_env::SANDBOX_AUTH_MODE) .env_remove(openshell_core::sandbox_env::SANDBOX_TOKEN_FILE) .env_remove(openshell_core::sandbox_env::K8S_SA_TOKEN_FILE); diff --git a/crates/openshell-sandbox/src/supervisor_session.rs b/crates/openshell-sandbox/src/supervisor_session.rs index 4d7392ee3..f3234bd61 100644 --- a/crates/openshell-sandbox/src/supervisor_session.rs +++ b/crates/openshell-sandbox/src/supervisor_session.rs @@ -419,12 +419,50 @@ fn handle_gateway_message( relay_close_from_gateway_event(crate::ocsf_ctx(), &close.channel_id, &close.reason); ocsf_emit!(event); } + Some(gateway_message::Payload::SandboxTokenUpdate(update)) => { + persist_gateway_managed_sandbox_token(sandbox_id, update); + } _ => { warn!(sandbox_id = %sandbox_id, "supervisor session: unexpected gateway message"); } } } +fn persist_gateway_managed_sandbox_token( + sandbox_id: &str, + update: &openshell_core::proto::SandboxTokenUpdate, +) { + if update.token.is_empty() { + warn!(sandbox_id = %sandbox_id, "supervisor session: ignoring empty sandbox token update"); + return; + } + let Some(path) = grpc_client::supervisor_pushed_token_file_path() else { + warn!( + sandbox_id = %sandbox_id, + "supervisor session: received sandbox token update without supervisor-push token mode" + ); + return; + }; + let contents = format!("{}\n", update.token); + if let Err(err) = + openshell_core::paths::write_file_owner_only_atomic(&path, contents.as_bytes()) + { + warn!( + sandbox_id = %sandbox_id, + path = %path.display(), + error = %err, + "supervisor session: failed to persist sandbox token update" + ); + return; + } + debug!( + sandbox_id = %sandbox_id, + path = %path.display(), + expires_at_ms = update.expires_at_ms, + "supervisor session: persisted gateway-managed sandbox token update" + ); +} + /// Handle a `RelayOpen` by initiating a `RelayStream` RPC on the gateway and /// bridging that stream to the local SSH daemon. /// diff --git a/crates/openshell-server/src/compute/mod.rs b/crates/openshell-server/src/compute/mod.rs index 98dc3fd63..f01b9c835 100644 --- a/crates/openshell-server/src/compute/mod.rs +++ b/crates/openshell-server/src/compute/mod.rs @@ -19,9 +19,9 @@ use openshell_core::proto::compute::v1::{ CreateSandboxRequest, DeleteSandboxRequest, DriverCondition, DriverPlatformEvent, DriverResourceRequirements, DriverSandbox, DriverSandboxSpec, DriverSandboxStatus, DriverSandboxTemplate, GetCapabilitiesRequest, GetSandboxRequest, ListSandboxesRequest, - ValidateSandboxCreateRequest, WatchSandboxesEvent, WatchSandboxesRequest, - compute_driver_client::ComputeDriverClient, compute_driver_server::ComputeDriver, - watch_sandboxes_event, + ResumeSandboxRequest, StopSandboxRequest, ValidateSandboxCreateRequest, WatchSandboxesEvent, + WatchSandboxesRequest, WriteSandboxTokenRequest, compute_driver_client::ComputeDriverClient, + compute_driver_server::ComputeDriver, watch_sandboxes_event, }; use openshell_core::proto::{ PlatformEvent, Sandbox, SandboxCondition, SandboxPhase, SandboxSpec, SandboxStatus, @@ -70,23 +70,85 @@ impl ShutdownCleanup for DockerComputeDriver { } /// Resume a single sandbox whose store record indicates it should be -/// running. Implemented by drivers (currently only Docker) where compute +/// running. Implemented by local drivers where compute /// resources do not auto-restart with the gateway. Returns `Ok(true)` if /// the backend resource was found and resumed (or was already running), /// `Ok(false)` if no backend resource exists. #[tonic::async_trait] trait StartupResume: Send + Sync { - async fn resume_sandbox(&self, sandbox_id: &str, sandbox_name: &str) -> Result; + async fn resume_sandbox( + &self, + sandbox_id: &str, + sandbox_name: &str, + sandbox_token: Option, + ) -> Result; +} + +/// Writes gateway-minted sandbox JWTs into driver-owned token state. +/// +/// Local single-player drivers use this for startup reissue and periodic +/// rotation. Docker and Podman write host files mounted into containers; VM +/// updates its persisted sandbox request and the connected supervisor receives +/// the same token over the control stream. +#[tonic::async_trait] +trait SandboxTokenWriter: Send + Sync { + async fn write_sandbox_token( + &self, + sandbox_id: &str, + sandbox_token: String, + ) -> Result<(), String>; } #[tonic::async_trait] impl StartupResume for DockerComputeDriver { - async fn resume_sandbox(&self, sandbox_id: &str, sandbox_name: &str) -> Result { - Self::resume_sandbox(self, sandbox_id, sandbox_name) + async fn resume_sandbox( + &self, + sandbox_id: &str, + sandbox_name: &str, + sandbox_token: Option, + ) -> Result { + Self::resume_sandbox(self, sandbox_id, sandbox_name, sandbox_token.as_deref()) .await .map_err(|err| err.to_string()) } } + +#[tonic::async_trait] +impl StartupResume for PodmanComputeDriver { + async fn resume_sandbox( + &self, + sandbox_id: &str, + sandbox_name: &str, + sandbox_token: Option, + ) -> Result { + Self::resume_sandbox(self, sandbox_id, sandbox_name, sandbox_token.as_deref()) + .await + .map_err(|err| err.to_string()) + } +} + +#[tonic::async_trait] +impl SandboxTokenWriter for DockerComputeDriver { + async fn write_sandbox_token( + &self, + sandbox_id: &str, + sandbox_token: String, + ) -> Result<(), String> { + Self::write_sandbox_token(self, sandbox_id, &sandbox_token).map_err(|err| err.to_string()) + } +} + +#[tonic::async_trait] +impl SandboxTokenWriter for PodmanComputeDriver { + async fn write_sandbox_token( + &self, + sandbox_id: &str, + sandbox_token: String, + ) -> Result<(), String> { + Self::write_sandbox_token(self, sandbox_id, &sandbox_token).map_err(|err| err.to_string()) + } +} + /// Interval between store-vs-backend reconciliation sweeps. const RECONCILE_INTERVAL: Duration = Duration::from_secs(60); @@ -187,9 +249,29 @@ impl ComputeDriver for RemoteComputeDriver { client.create_sandbox(request).await } + async fn resume_sandbox( + &self, + request: Request, + ) -> Result, Status> + { + let mut client = self.client(); + client.resume_sandbox(request).await + } + + async fn write_sandbox_token( + &self, + request: Request, + ) -> Result< + tonic::Response, + Status, + > { + let mut client = self.client(); + client.write_sandbox_token(request).await + } + async fn stop_sandbox( &self, - request: Request, + request: Request, ) -> Result, Status> { let mut client = self.client(); @@ -216,11 +298,52 @@ impl ComputeDriver for RemoteComputeDriver { } } +#[tonic::async_trait] +impl StartupResume for RemoteComputeDriver { + async fn resume_sandbox( + &self, + sandbox_id: &str, + sandbox_name: &str, + sandbox_token: Option, + ) -> Result { + let mut client = self.client(); + let response = client + .resume_sandbox(Request::new(ResumeSandboxRequest { + sandbox_id: sandbox_id.to_string(), + sandbox_name: sandbox_name.to_string(), + sandbox_token: sandbox_token.unwrap_or_default(), + })) + .await + .map_err(|status| status.message().to_string())?; + Ok(response.into_inner().resumed) + } +} + +#[tonic::async_trait] +impl SandboxTokenWriter for RemoteComputeDriver { + async fn write_sandbox_token( + &self, + sandbox_id: &str, + sandbox_token: String, + ) -> Result<(), String> { + let mut client = self.client(); + client + .write_sandbox_token(Request::new(WriteSandboxTokenRequest { + sandbox_id: sandbox_id.to_string(), + sandbox_token, + })) + .await + .map_err(|status| status.message().to_string())?; + Ok(()) + } +} + #[derive(Clone)] pub struct ComputeRuntime { driver: SharedComputeDriver, shutdown_cleanup: Option>, startup_resume: Option>, + sandbox_token_writer: Option>, _driver_process: Option>, default_image: String, store: Arc, @@ -228,6 +351,7 @@ pub struct ComputeRuntime { sandbox_watch_bus: SandboxWatchBus, tracing_log_bus: TracingLogBus, supervisor_sessions: Arc, + push_token_updates_to_supervisors: bool, sync_lock: Arc>, gateway_bind_addresses: Vec, } @@ -244,12 +368,14 @@ impl ComputeRuntime { driver: SharedComputeDriver, shutdown_cleanup: Option>, startup_resume: Option>, + sandbox_token_writer: Option>, driver_process: Option>, store: Arc, sandbox_index: SandboxIndex, sandbox_watch_bus: SandboxWatchBus, tracing_log_bus: TracingLogBus, supervisor_sessions: Arc, + push_token_updates_to_supervisors: bool, _allows_loopback_endpoints: bool, gateway_bind_addresses: Vec, ) -> Result { @@ -263,6 +389,7 @@ impl ComputeRuntime { driver, shutdown_cleanup, startup_resume, + sandbox_token_writer, _driver_process: driver_process, default_image, store, @@ -270,6 +397,7 @@ impl ComputeRuntime { sandbox_watch_bus, tracing_log_bus, supervisor_sessions, + push_token_updates_to_supervisors, sync_lock: Arc::new(Mutex::new(())), gateway_bind_addresses, }) @@ -302,17 +430,20 @@ impl ComputeRuntime { let gateway_bind_addresses = driver.gateway_bind_addresses(); let shutdown_cleanup: Arc = driver.clone(); let startup_resume: Arc = driver.clone(); + let sandbox_token_writer: Arc = driver.clone(); let driver: SharedComputeDriver = driver; Self::from_driver( driver, Some(shutdown_cleanup), Some(startup_resume), + Some(sandbox_token_writer), None, store, sandbox_index, sandbox_watch_bus, tracing_log_bus, supervisor_sessions, + false, true, gateway_bind_addresses, ) @@ -336,12 +467,14 @@ impl ComputeRuntime { None, None, None, + None, store, sandbox_index, sandbox_watch_bus, tracing_log_bus, supervisor_sessions, false, + false, Vec::new(), ) .await @@ -356,11 +489,13 @@ impl ComputeRuntime { tracing_log_bus: TracingLogBus, supervisor_sessions: Arc, ) -> Result { - let driver: SharedComputeDriver = Arc::new(RemoteComputeDriver::new(channel)); + let remote_driver = Arc::new(RemoteComputeDriver::new(channel)); + let driver: SharedComputeDriver = remote_driver.clone(); Self::from_driver( driver, None, - None, + Some(remote_driver.clone()), + Some(remote_driver), driver_process, store, sandbox_index, @@ -368,6 +503,7 @@ impl ComputeRuntime { tracing_log_bus, supervisor_sessions, true, + true, Vec::new(), ) .await @@ -381,20 +517,27 @@ impl ComputeRuntime { tracing_log_bus: TracingLogBus, supervisor_sessions: Arc, ) -> Result { - let driver = PodmanComputeDriver::new(config) - .await - .map_err(|err| ComputeError::Message(err.to_string()))?; - let driver: SharedComputeDriver = Arc::new(PodmanDriverService::new(driver)); + let podman_driver = Arc::new( + PodmanComputeDriver::new(config) + .await + .map_err(|err| ComputeError::Message(err.to_string()))?, + ); + let startup_resume: Arc = podman_driver.clone(); + let sandbox_token_writer: Arc = podman_driver.clone(); + let driver: SharedComputeDriver = + Arc::new(PodmanDriverService::new((*podman_driver).clone())); Self::from_driver( driver, None, - None, + Some(startup_resume), + Some(sandbox_token_writer), None, store, sandbox_index, sandbox_watch_bus, tracing_log_bus, supervisor_sessions, + false, true, Vec::new(), ) @@ -575,7 +718,7 @@ impl ComputeRuntime { /// Resume sandboxes whose store records say they should be running. /// Drivers that do not auto-restart compute resources across gateway - /// restarts (currently only Docker) implement `StartupResume`. For + /// restarts implement `StartupResume`. For /// each sandbox in the store whose phase is not `Deleting` or /// `Error`, we ask the driver to resume the underlying resource. If /// the driver reports that the resource no longer exists or fails to @@ -584,7 +727,10 @@ impl ComputeRuntime { /// /// Should be called once at gateway startup, before watchers spawn, /// so the watch loop sees the post-resume state on its first poll. - pub async fn resume_persisted_sandboxes(&self) -> Result<(), String> { + pub async fn resume_persisted_sandboxes( + &self, + sandbox_jwt_issuer: Option<&crate::auth::sandbox_jwt::SandboxJwtIssuer>, + ) -> Result<(), String> { let Some(resume) = &self.startup_resume else { return Ok(()); }; @@ -613,8 +759,18 @@ impl ComputeRuntime { continue; } + let sandbox_token = match sandbox_jwt_issuer { + Some(issuer) => Some( + issuer + .mint(sandbox.object_id()) + .map_err(|status| status.message().to_string())? + .token, + ), + None => None, + }; + match resume - .resume_sandbox(sandbox.object_id(), sandbox.object_name()) + .resume_sandbox(sandbox.object_id(), sandbox.object_name(), sandbox_token) .await { Ok(true) => { @@ -674,6 +830,136 @@ impl ComputeRuntime { Ok(()) } + /// Reissue gateway-managed sandbox JWTs into local driver token state. + /// + /// Docker and Podman rewrite token files mounted into containers. VM + /// updates persisted driver state and receives live token pushes over the + /// supervisor control stream. Startup and periodic refresh let supervisors + /// recover from gateway restarts without self-refreshing their own tokens. + pub async fn refresh_gateway_managed_sandbox_tokens( + &self, + sandbox_jwt_issuer: &crate::auth::sandbox_jwt::SandboxJwtIssuer, + ) -> Result<(usize, usize), String> { + let Some(writer) = &self.sandbox_token_writer else { + return Ok((0, 0)); + }; + + let records = self + .store + .list(Sandbox::object_type(), 1000, 0) + .await + .map_err(|e| e.to_string())?; + + let mut refreshed = 0usize; + let mut failed = 0usize; + + for record in records { + let sandbox = match Sandbox::decode(record.payload.as_slice()) { + Ok(sandbox) => sandbox, + Err(err) => { + warn!(error = %err, "Failed to decode sandbox record during token refresh"); + continue; + } + }; + + let phase = SandboxPhase::try_from(sandbox.phase).unwrap_or(SandboxPhase::Unknown); + if !sandbox_phase_should_be_running(phase) { + continue; + } + + let minted = match sandbox_jwt_issuer.mint(sandbox.object_id()) { + Ok(minted) => minted, + Err(status) => { + failed += 1; + warn!( + sandbox_id = %sandbox.object_id(), + sandbox_name = %sandbox.object_name(), + error = %status, + "Failed to mint replacement sandbox JWT" + ); + continue; + } + }; + let sandbox_token = minted.token.clone(); + + match writer + .write_sandbox_token(sandbox.object_id(), sandbox_token) + .await + { + Ok(()) => { + refreshed += 1; + if self.push_token_updates_to_supervisors + && let Err(err) = self + .supervisor_sessions + .send_sandbox_token_update( + sandbox.object_id(), + minted.token, + minted.expires_at_ms, + ) + .await + { + warn!( + sandbox_id = %sandbox.object_id(), + sandbox_name = %sandbox.object_name(), + error = %err, + "Failed to deliver replacement sandbox JWT to connected supervisor" + ); + } + } + Err(err) => { + failed += 1; + warn!( + sandbox_id = %sandbox.object_id(), + sandbox_name = %sandbox.object_name(), + error = %err, + "Failed to write replacement sandbox JWT" + ); + } + } + } + + if refreshed > 0 || failed > 0 { + info!( + refreshed, + failed, "Gateway-managed sandbox token refresh sweep complete" + ); + } + + Ok((refreshed, failed)) + } + + pub fn spawn_gateway_managed_sandbox_token_rotator( + &self, + sandbox_jwt_issuer: Arc, + ) { + if self.sandbox_token_writer.is_none() { + return; + } + + let runtime = Arc::new(self.clone()); + let interval = sandbox_token_rotation_interval(sandbox_jwt_issuer.ttl()); + tokio::spawn(async move { + loop { + tokio::time::sleep(interval).await; + match runtime + .refresh_gateway_managed_sandbox_tokens(&sandbox_jwt_issuer) + .await + { + Ok((_refreshed, failed)) if failed > 0 => { + warn!(failed, "Some gateway-managed sandbox JWTs failed to rotate"); + } + Ok(_) => {} + Err(err) => { + warn!( + error = %err, + "Gateway-managed sandbox token rotation sweep failed" + ); + } + } + } + }); + } + async fn mark_sandbox_error(&self, sandbox: &Sandbox, reason: &str, message: &str) { let _guard = self.sync_lock.lock().await; let sandbox_id = sandbox.object_id().to_string(); @@ -1617,6 +1903,25 @@ fn sandbox_phase_should_be_running(phase: SandboxPhase) -> bool { ) } +fn sandbox_token_rotation_interval(ttl: Duration) -> Duration { + let ttl_ms = u64::try_from(ttl.as_millis()).unwrap_or(u64::MAX); + if ttl_ms <= 1 { + return Duration::from_millis(1); + } + + let max_ms = u64::try_from(Duration::from_secs(12 * 60 * 60).as_millis()).unwrap_or(u64::MAX); + let before_expiry_ms = ttl_ms.saturating_sub(1).max(1); + let rotate_ms = ttl_ms + .saturating_mul(4) + .checked_div(5) + .unwrap_or(1) + .max(250) + .min(max_ms) + .min(before_expiry_ms) + .max(1); + Duration::from_millis(rotate_ms) +} + fn is_terminal_failure_reason(reason: &str) -> bool { let reason = reason.to_ascii_lowercase(); let transient_reasons = [ @@ -1698,9 +2003,31 @@ impl ComputeDriver for NoopTestDriver { )) } + async fn resume_sandbox( + &self, + _request: Request, + ) -> Result, Status> + { + Ok(tonic::Response::new( + openshell_core::proto::compute::v1::ResumeSandboxResponse { resumed: false }, + )) + } + + async fn write_sandbox_token( + &self, + _request: Request, + ) -> Result< + tonic::Response, + Status, + > { + Ok(tonic::Response::new( + openshell_core::proto::compute::v1::WriteSandboxTokenResponse {}, + )) + } + async fn stop_sandbox( &self, - _request: Request, + _request: Request, ) -> Result, Status> { Ok(tonic::Response::new( @@ -1732,6 +2059,7 @@ pub async fn new_test_runtime(store: Arc) -> ComputeRuntime { driver: Arc::new(NoopTestDriver), shutdown_cleanup: None, startup_resume: None, + sandbox_token_writer: None, _driver_process: None, default_image: "openshell/sandbox:test".to_string(), store, @@ -1739,6 +2067,7 @@ pub async fn new_test_runtime(store: Arc) -> ComputeRuntime { sandbox_watch_bus: SandboxWatchBus::new(), tracing_log_bus: TracingLogBus::new(), supervisor_sessions: Arc::new(SupervisorSessionRegistry::new()), + push_token_updates_to_supervisors: false, sync_lock: Arc::new(Mutex::new(())), gateway_bind_addresses: Vec::new(), } @@ -1748,9 +2077,11 @@ pub async fn new_test_runtime(store: Arc) -> ComputeRuntime { mod tests { use super::*; use futures::stream; + use openshell_bootstrap::jwt::generate_jwt_key; use openshell_core::proto::compute::v1::{ CreateSandboxResponse, DeleteSandboxResponse, GetCapabilitiesResponse, GetSandboxRequest, - GetSandboxResponse, StopSandboxRequest, StopSandboxResponse, ValidateSandboxCreateResponse, + GetSandboxResponse, ResumeSandboxResponse, StopSandboxRequest, StopSandboxResponse, + ValidateSandboxCreateResponse, WriteSandboxTokenResponse, }; use std::collections::HashMap; use std::sync::Arc; @@ -1862,6 +2193,22 @@ mod tests { Ok(tonic::Response::new(CreateSandboxResponse {})) } + async fn resume_sandbox( + &self, + _request: Request, + ) -> Result, Status> { + Ok(tonic::Response::new(ResumeSandboxResponse { + resumed: false, + })) + } + + async fn write_sandbox_token( + &self, + _request: Request, + ) -> Result, Status> { + Ok(tonic::Response::new(WriteSandboxTokenResponse {})) + } + async fn stop_sandbox( &self, _request: Request, @@ -1887,18 +2234,27 @@ mod tests { } async fn test_runtime(driver: SharedComputeDriver) -> ComputeRuntime { - test_runtime_with_resume(driver, None).await + test_runtime_with_hooks(driver, None, None).await } async fn test_runtime_with_resume( driver: SharedComputeDriver, startup_resume: Option>, + ) -> ComputeRuntime { + test_runtime_with_hooks(driver, startup_resume, None).await + } + + async fn test_runtime_with_hooks( + driver: SharedComputeDriver, + startup_resume: Option>, + sandbox_token_writer: Option>, ) -> ComputeRuntime { let store = Arc::new(Store::connect("sqlite::memory:").await.unwrap()); ComputeRuntime { driver, shutdown_cleanup: None, startup_resume, + sandbox_token_writer, _driver_process: None, default_image: "openshell/sandbox:test".to_string(), store, @@ -1906,6 +2262,7 @@ mod tests { sandbox_watch_bus: SandboxWatchBus::new(), tracing_log_bus: TracingLogBus::new(), supervisor_sessions: Arc::new(SupervisorSessionRegistry::new()), + push_token_updates_to_supervisors: false, sync_lock: Arc::new(Mutex::new(())), gateway_bind_addresses: Vec::new(), } @@ -2669,7 +3026,7 @@ mod tests { #[derive(Default)] struct RecordingResume { - calls: Mutex>, + calls: Mutex)>>, results: Mutex>>, } @@ -2681,7 +3038,7 @@ mod tests { .insert(sandbox_id.to_string(), result); } - async fn calls(&self) -> Vec<(String, String)> { + async fn calls(&self) -> Vec<(String, String, Option)> { self.calls.lock().await.clone() } } @@ -2692,17 +3049,58 @@ mod tests { &self, sandbox_id: &str, sandbox_name: &str, + sandbox_token: Option, ) -> Result { + self.calls.lock().await.push(( + sandbox_id.to_string(), + sandbox_name.to_string(), + sandbox_token, + )); + self.results + .lock() + .await + .get(sandbox_id) + .cloned() + .unwrap_or(Ok(true)) + } + } + + #[derive(Default)] + struct RecordingTokenWriter { + calls: Mutex>, + results: Mutex>>, + } + + impl RecordingTokenWriter { + async fn set_result(&self, sandbox_id: &str, result: Result<(), String>) { + self.results + .lock() + .await + .insert(sandbox_id.to_string(), result); + } + + async fn calls(&self) -> Vec<(String, String)> { + self.calls.lock().await.clone() + } + } + + #[tonic::async_trait] + impl SandboxTokenWriter for RecordingTokenWriter { + async fn write_sandbox_token( + &self, + sandbox_id: &str, + sandbox_token: String, + ) -> Result<(), String> { self.calls .lock() .await - .push((sandbox_id.to_string(), sandbox_name.to_string())); + .push((sandbox_id.to_string(), sandbox_token)); self.results .lock() .await .get(sandbox_id) .cloned() - .unwrap_or(Ok(true)) + .unwrap_or(Ok(())) } } @@ -2723,13 +3121,13 @@ mod tests { runtime.store.put_message(&sandbox).await.unwrap(); } - runtime.resume_persisted_sandboxes().await.unwrap(); + runtime.resume_persisted_sandboxes(None).await.unwrap(); let mut called_ids = resume .calls() .await .into_iter() - .map(|(id, _)| id) + .map(|(id, _, _)| id) .collect::>(); called_ids.sort(); assert_eq!( @@ -2742,6 +3140,35 @@ mod tests { ); } + #[tokio::test] + async fn resume_persisted_sandboxes_mints_tokens_for_startup_resume() { + let resume = Arc::new(RecordingResume::default()); + let runtime = + test_runtime_with_resume(Arc::new(TestDriver::default()), Some(resume.clone())).await; + let sandbox = sandbox_record("sb-1", "sandbox-a", SandboxPhase::Ready); + runtime.store.put_message(&sandbox).await.unwrap(); + let mat = generate_jwt_key().expect("jwt key"); + let issuer = crate::auth::sandbox_jwt::SandboxJwtIssuer::from_pem( + mat.signing_key_pem.as_bytes(), + mat.kid, + "test-gateway", + Duration::from_secs(3600), + ) + .unwrap(); + + runtime + .resume_persisted_sandboxes(Some(&issuer)) + .await + .unwrap(); + + let calls = resume.calls().await; + assert_eq!(calls.len(), 1); + assert_eq!(calls[0].0, "sb-1"); + assert_eq!(calls[0].1, "sandbox-a"); + let token = calls[0].2.as_deref().expect("startup resume token"); + assert_eq!(token.split('.').count(), 3); + } + #[tokio::test] async fn resume_persisted_sandboxes_marks_missing_backend_as_error() { let resume = Arc::new(RecordingResume::default()); @@ -2752,7 +3179,7 @@ mod tests { let sandbox = sandbox_record("sb-1", "missing", SandboxPhase::Ready); runtime.store.put_message(&sandbox).await.unwrap(); - runtime.resume_persisted_sandboxes().await.unwrap(); + runtime.resume_persisted_sandboxes(None).await.unwrap(); let stored = runtime .store @@ -2784,7 +3211,7 @@ mod tests { let sandbox = sandbox_record("sb-1", "broken", SandboxPhase::Provisioning); runtime.store.put_message(&sandbox).await.unwrap(); - runtime.resume_persisted_sandboxes().await.unwrap(); + runtime.resume_persisted_sandboxes(None).await.unwrap(); let stored = runtime .store @@ -2811,7 +3238,7 @@ mod tests { let sandbox = sandbox_record("sb-1", "anywhere", SandboxPhase::Ready); runtime.store.put_message(&sandbox).await.unwrap(); - runtime.resume_persisted_sandboxes().await.unwrap(); + runtime.resume_persisted_sandboxes(None).await.unwrap(); let stored = runtime .store @@ -2825,6 +3252,122 @@ mod tests { ); } + #[tokio::test] + async fn refresh_gateway_managed_sandbox_tokens_writes_running_phase_tokens() { + let writer = Arc::new(RecordingTokenWriter::default()); + let runtime = + test_runtime_with_hooks(Arc::new(TestDriver::default()), None, Some(writer.clone())) + .await; + + for (id, name, phase) in [ + ("sb-prov", "prov", SandboxPhase::Provisioning), + ("sb-ready", "ready", SandboxPhase::Ready), + ("sb-unknown", "unknown", SandboxPhase::Unknown), + ("sb-deleting", "deleting", SandboxPhase::Deleting), + ("sb-error", "error", SandboxPhase::Error), + ] { + let sandbox = sandbox_record(id, name, phase); + runtime.store.put_message(&sandbox).await.unwrap(); + } + + let mat = generate_jwt_key().expect("jwt key"); + let issuer = crate::auth::sandbox_jwt::SandboxJwtIssuer::from_pem( + mat.signing_key_pem.as_bytes(), + mat.kid, + "test-gateway", + Duration::from_secs(3600), + ) + .unwrap(); + + let (refreshed, failed) = runtime + .refresh_gateway_managed_sandbox_tokens(&issuer) + .await + .unwrap(); + + assert_eq!(refreshed, 3); + assert_eq!(failed, 0); + let mut calls = writer.calls().await; + calls.sort_by(|left, right| left.0.cmp(&right.0)); + assert_eq!( + calls + .iter() + .map(|(sandbox_id, _)| sandbox_id.as_str()) + .collect::>(), + vec!["sb-prov", "sb-ready", "sb-unknown"] + ); + for (_sandbox_id, token) in calls { + assert_eq!(token.split('.').count(), 3); + } + } + + #[tokio::test] + async fn refresh_gateway_managed_sandbox_tokens_counts_write_failures() { + let writer = Arc::new(RecordingTokenWriter::default()); + writer + .set_result("sb-1", Err("disk is read-only".to_string())) + .await; + let runtime = + test_runtime_with_hooks(Arc::new(TestDriver::default()), None, Some(writer.clone())) + .await; + + let sandbox = sandbox_record("sb-1", "ready", SandboxPhase::Ready); + runtime.store.put_message(&sandbox).await.unwrap(); + + let mat = generate_jwt_key().expect("jwt key"); + let issuer = crate::auth::sandbox_jwt::SandboxJwtIssuer::from_pem( + mat.signing_key_pem.as_bytes(), + mat.kid, + "test-gateway", + Duration::from_secs(3600), + ) + .unwrap(); + + let (refreshed, failed) = runtime + .refresh_gateway_managed_sandbox_tokens(&issuer) + .await + .unwrap(); + + assert_eq!(refreshed, 0); + assert_eq!(failed, 1); + assert_eq!(writer.calls().await.len(), 1); + } + + #[tokio::test] + async fn refresh_gateway_managed_sandbox_tokens_is_noop_without_writer() { + let runtime = test_runtime(Arc::new(TestDriver::default())).await; + let sandbox = sandbox_record("sb-1", "ready", SandboxPhase::Ready); + runtime.store.put_message(&sandbox).await.unwrap(); + + let mat = generate_jwt_key().expect("jwt key"); + let issuer = crate::auth::sandbox_jwt::SandboxJwtIssuer::from_pem( + mat.signing_key_pem.as_bytes(), + mat.kid, + "test-gateway", + Duration::from_secs(3600), + ) + .unwrap(); + + let (refreshed, failed) = runtime + .refresh_gateway_managed_sandbox_tokens(&issuer) + .await + .unwrap(); + + assert_eq!((refreshed, failed), (0, 0)); + } + + #[test] + fn sandbox_token_rotation_interval_rotates_before_expiry() { + assert_eq!( + sandbox_token_rotation_interval(Duration::from_secs(3)), + Duration::from_millis(2400) + ); + assert!(sandbox_token_rotation_interval(Duration::from_secs(1)) < Duration::from_secs(1)); + assert_eq!( + sandbox_token_rotation_interval(Duration::from_secs(24 * 60 * 60)), + Duration::from_secs(12 * 60 * 60) + ); + } + #[test] fn build_platform_config_inverts_user_namespaces_to_host_users() { use prost_types::value::Kind; diff --git a/crates/openshell-server/src/config_file.rs b/crates/openshell-server/src/config_file.rs index 7d7c99cc3..bd96812dd 100644 --- a/crates/openshell-server/src/config_file.rs +++ b/crates/openshell-server/src/config_file.rs @@ -118,8 +118,8 @@ pub struct GatewayFileSection { #[serde(default)] pub enable_user_namespaces: Option, /// Lifetime (seconds) of the projected `ServiceAccount` token kubelet - /// writes for the `IssueSandboxToken` bootstrap exchange. Driver - /// clamps to `[600, 86400]`. + /// writes for `IssueSandboxToken` exchanges. Driver clamps to + /// `[600, 86400]`. #[serde(default)] pub sa_token_ttl_secs: Option, #[serde(default)] diff --git a/crates/openshell-server/src/grpc/auth_rpc.rs b/crates/openshell-server/src/grpc/auth_rpc.rs index 8e98b1824..5035558c8 100644 --- a/crates/openshell-server/src/grpc/auth_rpc.rs +++ b/crates/openshell-server/src/grpc/auth_rpc.rs @@ -4,8 +4,8 @@ //! Authentication-related RPC handlers. //! //! Hosts the two sandbox-identity RPCs: -//! - `IssueSandboxToken` — bootstrap exchange (K8s SA token → gateway JWT) -//! - `RefreshSandboxToken` — renew a still-valid gateway JWT +//! - `IssueSandboxToken` - Kubernetes SA token exchange to gateway JWT +//! - `RefreshSandboxToken` - manual/direct renewal of a still-valid gateway JWT //! //! Both end in a fresh gateway-signed JWT minted by //! [`crate::auth::sandbox_jwt::SandboxJwtIssuer`]. Older tokens remain valid @@ -38,9 +38,9 @@ pub async fn handle_issue_sandbox_token( )); }; - // Only the bootstrap K8s ServiceAccount path can mint a fresh gateway JWT - // via this RPC. Sandboxes already holding a gateway JWT use - // `RefreshSandboxToken` instead. + // Only the K8s ServiceAccount exchange path can mint a fresh gateway JWT + // via this RPC. Local runtimes receive gateway-managed token updates + // through their compute driver or supervisor session instead. if !matches!( sandbox.source, SandboxIdentitySource::K8sServiceAccount { .. } @@ -50,7 +50,7 @@ pub async fn handle_issue_sandbox_token( "IssueSandboxToken rejected: non-bootstrap principal source" ); return Err(Status::permission_denied( - "this principal cannot mint a sandbox token; use RefreshSandboxToken", + "this principal cannot mint a sandbox token with IssueSandboxToken", )); } @@ -100,7 +100,7 @@ pub async fn handle_refresh_sandbox_token( "RefreshSandboxToken rejected: non-gateway-JWT principal source" ); return Err(Status::permission_denied( - "this principal cannot refresh; use IssueSandboxToken for bootstrap", + "this principal cannot refresh; use IssueSandboxToken for service-account exchange", )); }; @@ -313,8 +313,8 @@ mod tests { #[tokio::test] async fn refresh_rejects_k8s_sa_principal() { - // K8s SA-bootstrap principals must use IssueSandboxToken, not - // RefreshSandboxToken — the refresh path assumes a still-valid + // K8s ServiceAccount principals must use IssueSandboxToken, not + // RefreshSandboxToken, because this path assumes a still-valid // gateway-minted JWT exists. use crate::auth::principal::SandboxIdentitySource; let state = state_with_issuer().await; diff --git a/crates/openshell-server/src/lib.rs b/crates/openshell-server/src/lib.rs index 1b20ba069..f33aabfcd 100644 --- a/crates/openshell-server/src/lib.rs +++ b/crates/openshell-server/src/lib.rs @@ -336,10 +336,38 @@ pub async fn run_server( // shutdown so the running compute state matches the persisted store. // Runs before watchers spawn so the watch loop sees the post-resume // snapshot on its first poll. - if let Err(err) = state.compute.resume_persisted_sandboxes().await { + if let Err(err) = state + .compute + .resume_persisted_sandboxes(state.sandbox_jwt_issuer.as_deref()) + .await + { warn!(error = %err, "Failed to resume persisted sandboxes during startup"); } + if let Some(issuer) = state.sandbox_jwt_issuer.as_deref() { + match state + .compute + .refresh_gateway_managed_sandbox_tokens(issuer) + .await + { + Ok((_refreshed, failed)) => { + if failed > 0 { + warn!( + failed, + "Some gateway-managed sandbox JWTs failed to refresh during startup" + ); + } + } + Err(err) => warn!(error = %err, "Failed to refresh sandbox JWT files during startup"), + } + } + + if let Some(issuer) = state.sandbox_jwt_issuer.clone() { + state + .compute + .spawn_gateway_managed_sandbox_token_rotator(issuer); + } + state.compute.spawn_watchers(); ssh_sessions::spawn_session_reaper(store.clone(), Duration::from_secs(3600)); supervisor_session::spawn_relay_reaper(state.clone(), Duration::from_secs(30)); diff --git a/crates/openshell-server/src/supervisor_session.rs b/crates/openshell-server/src/supervisor_session.rs index 8f186dcac..a8c1cf55a 100644 --- a/crates/openshell-server/src/supervisor_session.rs +++ b/crates/openshell-server/src/supervisor_session.rs @@ -13,8 +13,8 @@ use tracing::{info, warn}; use uuid::Uuid; use openshell_core::proto::{ - GatewayMessage, RelayFrame, RelayInit, RelayOpen, Sandbox, SessionAccepted, SshRelayTarget, - SupervisorMessage, gateway_message, relay_open, supervisor_message, + GatewayMessage, RelayFrame, RelayInit, RelayOpen, Sandbox, SandboxTokenUpdate, SessionAccepted, + SshRelayTarget, SupervisorMessage, gateway_message, relay_open, supervisor_message, }; use crate::ServerState; @@ -201,6 +201,29 @@ impl SupervisorSessionRegistry { self.sessions.lock().unwrap().contains_key(sandbox_id) } + pub async fn send_sandbox_token_update( + &self, + sandbox_id: &str, + token: String, + expires_at_ms: i64, + ) -> Result { + let Some(tx) = self.lookup_session(sandbox_id) else { + return Ok(false); + }; + let msg = GatewayMessage { + payload: Some(gateway_message::Payload::SandboxTokenUpdate( + SandboxTokenUpdate { + token, + expires_at_ms, + }, + )), + }; + tx.send(msg) + .await + .map_err(|_| "supervisor session disconnected".to_string())?; + Ok(true) + } + fn pending_channel_ids(&self, sandbox_id: &str) -> Vec { self.pending_relays .lock() diff --git a/e2e/rust/Cargo.toml b/e2e/rust/Cargo.toml index 26957baab..17b07fdf8 100644 --- a/e2e/rust/Cargo.toml +++ b/e2e/rust/Cargo.toml @@ -46,16 +46,31 @@ name = "gateway_resume" path = "tests/gateway_resume.rs" required-features = ["e2e-docker"] +[[test]] +name = "gateway_resume_expired_token" +path = "tests/gateway_resume_expired_token.rs" +required-features = ["e2e-docker"] + [[test]] name = "podman_gateway_resume" path = "tests/podman_gateway_resume.rs" required-features = ["e2e-podman"] +[[test]] +name = "podman_gateway_resume_expired_token" +path = "tests/podman_gateway_resume_expired_token.rs" +required-features = ["e2e-podman"] + [[test]] name = "vm_gateway_resume" path = "tests/vm_gateway_resume.rs" required-features = ["e2e-vm"] +[[test]] +name = "vm_gateway_resume_expired_token" +path = "tests/vm_gateway_resume_expired_token.rs" +required-features = ["e2e-vm"] + [[test]] name = "readyz_health" path = "tests/readyz_health.rs" diff --git a/e2e/rust/e2e-docker.sh b/e2e/rust/e2e-docker.sh index a020f87c8..16bc28549 100755 --- a/e2e/rust/e2e-docker.sh +++ b/e2e/rust/e2e-docker.sh @@ -12,6 +12,11 @@ ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" E2E_TEST="${OPENSHELL_E2E_DOCKER_TEST:-smoke}" E2E_FEATURES="${OPENSHELL_E2E_DOCKER_FEATURES:-e2e,e2e-docker}" +if [ "${E2E_TEST}" = "gateway_resume_expired_token" ] \ + && [ -z "${OPENSHELL_E2E_SANDBOX_JWT_TTL_SECS:-}" ]; then + export OPENSHELL_E2E_SANDBOX_JWT_TTL_SECS=3 +fi + cargo build -p openshell-cli --features openshell-core/dev-settings exec "${ROOT}/e2e/with-docker-gateway.sh" \ diff --git a/e2e/rust/e2e-podman.sh b/e2e/rust/e2e-podman.sh index 5f325d0d2..b795762d1 100755 --- a/e2e/rust/e2e-podman.sh +++ b/e2e/rust/e2e-podman.sh @@ -12,6 +12,11 @@ ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" E2E_TEST="${OPENSHELL_E2E_PODMAN_TEST:-}" E2E_FEATURES="${OPENSHELL_E2E_PODMAN_FEATURES:-e2e-podman}" +if [ "${E2E_TEST}" = "podman_gateway_resume_expired_token" ] \ + && [ -z "${OPENSHELL_E2E_SANDBOX_JWT_TTL_SECS:-}" ]; then + export OPENSHELL_E2E_SANDBOX_JWT_TTL_SECS=3 +fi + cargo build -p openshell-cli --features openshell-core/dev-settings TEST_ARGS=( diff --git a/e2e/rust/e2e-vm.sh b/e2e/rust/e2e-vm.sh index 69cb03309..b01240a26 100755 --- a/e2e/rust/e2e-vm.sh +++ b/e2e/rust/e2e-vm.sh @@ -50,6 +50,11 @@ DRIVER_BIN="${ROOT}/target/debug/openshell-driver-vm" E2E_TEST="${OPENSHELL_E2E_VM_TEST:-smoke}" E2E_FEATURES="${OPENSHELL_E2E_VM_FEATURES:-e2e-vm}" +if [ "${E2E_TEST}" = "vm_gateway_resume_expired_token" ] \ + && [ -z "${OPENSHELL_E2E_SANDBOX_JWT_TTL_SECS:-}" ]; then + export OPENSHELL_E2E_SANDBOX_JWT_TTL_SECS=3 +fi + # The VM driver places `compute-driver.sock` under `[openshell.drivers.vm].state_dir`. # AF_UNIX SUN_LEN is 104 bytes on macOS (108 on Linux), so paths anchored # in the workspace's `target/` blow the limit on typical developer @@ -197,6 +202,17 @@ echo "==> Starting openshell-gateway on 127.0.0.1:${HOST_PORT} (state: ${RUN_STA # (192.168.127.1) does NOT forward arbitrary host ports. e2e_generate_gateway_jwt "${JWT_DIR}" e2e_generate_pki "${GATEWAY_BIN}" "${PKI_DIR}" +SANDBOX_JWT_TTL_SECS="${OPENSHELL_E2E_SANDBOX_JWT_TTL_SECS:-3600}" +case "${SANDBOX_JWT_TTL_SECS}" in + ''|*[!0-9]*) + echo "ERROR: OPENSHELL_E2E_SANDBOX_JWT_TTL_SECS must be a positive integer, got '${SANDBOX_JWT_TTL_SECS}'" >&2 + exit 1 + ;; +esac +if [ "${SANDBOX_JWT_TTL_SECS}" -lt 1 ]; then + echo "ERROR: OPENSHELL_E2E_SANDBOX_JWT_TTL_SECS must be >= 1" >&2 + exit 1 +fi cat >"${GATEWAY_CONFIG}" < Result { + let namespace_filter = format!("label={SANDBOX_NAMESPACE_LABEL}={namespace}"); + let sandbox_name_filter = format!("label={SANDBOX_NAME_LABEL}={sandbox_name}"); + let output = Command::new("docker") + .args(["ps", "-aq", "--filter", MANAGED_BY_LABEL_FILTER, "--filter"]) + .arg(namespace_filter) + .args(["--filter"]) + .arg(sandbox_name_filter) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .output() + .map_err(|err| format!("failed to run docker ps: {err}"))?; + let stdout = String::from_utf8_lossy(&output.stdout); + let stderr = String::from_utf8_lossy(&output.stderr); + let combined = format!("{stdout}{stderr}"); + if !output.status.success() { + return Err(format!( + "docker ps failed (exit {:?}):\n{combined}", + output.status.code() + )); + } + + let ids = stdout + .lines() + .map(str::trim) + .filter(|line| !line.is_empty()) + .collect::>(); + match ids.as_slice() { + [id] => Ok((*id).to_string()), + [] => Err(format!( + "no Docker container found for sandbox '{sandbox_name}' in namespace '{namespace}'" + )), + _ => Err(format!( + "multiple Docker containers found for sandbox '{sandbox_name}' in namespace '{namespace}': {ids:?}" + )), + } +} + +/// Return whether one managed Docker sandbox container is currently running. +/// +/// # Errors +/// +/// Returns an error if Docker cannot inspect the container or reports an +/// unexpected state value. +pub fn sandbox_container_running(namespace: &str, sandbox_name: &str) -> Result { + let container_id = sandbox_container_id(namespace, sandbox_name)?; + let output = Command::new("docker") + .args(["inspect", "-f", "{{.State.Running}}", &container_id]) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .output() + .map_err(|err| format!("failed to run docker inspect: {err}"))?; + let stdout = String::from_utf8_lossy(&output.stdout); + let stderr = String::from_utf8_lossy(&output.stderr); + let combined = format!("{stdout}{stderr}"); + if !output.status.success() { + return Err(format!( + "docker inspect failed (exit {:?}):\n{combined}", + output.status.code() + )); + } + + match stdout.trim() { + "true" => Ok(true), + "false" => Ok(false), + other => Err(format!( + "unexpected Docker running state for container {container_id}: {other}" + )), + } +} + +/// Wait for a Docker sandbox container to reach the expected running state. +/// +/// # Errors +/// +/// Returns the last observed Docker state/error if the timeout elapses. +pub async fn wait_for_container_running( + namespace: &str, + sandbox_name: &str, + expected: bool, + timeout: Duration, +) -> Result<(), String> { + let start = std::time::Instant::now(); + let mut last_state: String; + + loop { + match sandbox_container_running(namespace, sandbox_name) { + Ok(running) if running == expected => return Ok(()), + Ok(running) => last_state = format!("running={running}"), + Err(err) => last_state = err, + } + + if start.elapsed() > timeout { + return Err(format!( + "sandbox container '{sandbox_name}' did not reach running={expected} within {}s. Last state: {last_state}", + timeout.as_secs() + )); + } + sleep(Duration::from_secs(1)).await; + } +} diff --git a/e2e/rust/src/harness/mod.rs b/e2e/rust/src/harness/mod.rs index f2dfd5ec9..dbc82181e 100644 --- a/e2e/rust/src/harness/mod.rs +++ b/e2e/rust/src/harness/mod.rs @@ -6,6 +6,7 @@ pub mod binary; pub mod cli; pub mod container; +pub mod docker; pub mod gateway; pub mod output; pub mod port; diff --git a/e2e/rust/tests/gateway_resume.rs b/e2e/rust/tests/gateway_resume.rs index 8f850e485..153fb84b0 100644 --- a/e2e/rust/tests/gateway_resume.rs +++ b/e2e/rust/tests/gateway_resume.rs @@ -9,112 +9,17 @@ //! `e2e/with-docker-gateway.sh`. Existing-endpoint E2E runs do not own the //! gateway process, so they skip this restart-only coverage. -use std::process::{Command, Stdio}; use std::time::Duration; use openshell_e2e::harness::cli::{ sandbox_names, wait_for_healthy, wait_for_sandbox_exec_contains, }; +use openshell_e2e::harness::docker::wait_for_container_running; use openshell_e2e::harness::gateway::ManagedGateway; use openshell_e2e::harness::sandbox::SandboxGuard; -use tokio::time::sleep; -const MANAGED_BY_LABEL_FILTER: &str = "label=openshell.ai/managed-by=openshell"; const READY_MARKER: &str = "gateway-resume-ready"; const RESUME_FILE: &str = "/sandbox/gateway-resume-state"; -const SANDBOX_NAMESPACE_LABEL: &str = "openshell.ai/sandbox-namespace"; -const SANDBOX_NAME_LABEL: &str = "openshell.ai/sandbox-name"; - -fn sandbox_container_id(namespace: &str, sandbox_name: &str) -> Result { - let namespace_filter = format!("label={SANDBOX_NAMESPACE_LABEL}={namespace}"); - let sandbox_name_filter = format!("label={SANDBOX_NAME_LABEL}={sandbox_name}"); - let output = Command::new("docker") - .args(["ps", "-aq", "--filter", MANAGED_BY_LABEL_FILTER, "--filter"]) - .arg(namespace_filter) - .args(["--filter"]) - .arg(sandbox_name_filter) - .stdout(Stdio::piped()) - .stderr(Stdio::piped()) - .output() - .map_err(|err| format!("failed to run docker ps: {err}"))?; - let stdout = String::from_utf8_lossy(&output.stdout); - let stderr = String::from_utf8_lossy(&output.stderr); - let combined = format!("{stdout}{stderr}"); - if !output.status.success() { - return Err(format!( - "docker ps failed (exit {:?}):\n{combined}", - output.status.code() - )); - } - - let ids = stdout - .lines() - .map(str::trim) - .filter(|line| !line.is_empty()) - .collect::>(); - match ids.as_slice() { - [id] => Ok((*id).to_string()), - [] => Err(format!( - "no Docker container found for sandbox '{sandbox_name}' in namespace '{namespace}'" - )), - _ => Err(format!( - "multiple Docker containers found for sandbox '{sandbox_name}' in namespace '{namespace}': {ids:?}" - )), - } -} - -fn sandbox_container_running(namespace: &str, sandbox_name: &str) -> Result { - let container_id = sandbox_container_id(namespace, sandbox_name)?; - let output = Command::new("docker") - .args(["inspect", "-f", "{{.State.Running}}", &container_id]) - .stdout(Stdio::piped()) - .stderr(Stdio::piped()) - .output() - .map_err(|err| format!("failed to run docker inspect: {err}"))?; - let stdout = String::from_utf8_lossy(&output.stdout); - let stderr = String::from_utf8_lossy(&output.stderr); - let combined = format!("{stdout}{stderr}"); - if !output.status.success() { - return Err(format!( - "docker inspect failed (exit {:?}):\n{combined}", - output.status.code() - )); - } - - match stdout.trim() { - "true" => Ok(true), - "false" => Ok(false), - other => Err(format!( - "unexpected Docker running state for container {container_id}: {other}" - )), - } -} - -async fn wait_for_container_running( - namespace: &str, - sandbox_name: &str, - expected: bool, - timeout: Duration, -) -> Result<(), String> { - let start = std::time::Instant::now(); - let mut last_state: String; - - loop { - match sandbox_container_running(namespace, sandbox_name) { - Ok(running) if running == expected => return Ok(()), - Ok(running) => last_state = format!("running={running}"), - Err(err) => last_state = err, - } - - if start.elapsed() > timeout { - return Err(format!( - "sandbox container '{sandbox_name}' did not reach running={expected} within {}s. Last state: {last_state}", - timeout.as_secs() - )); - } - sleep(Duration::from_secs(1)).await; - } -} #[tokio::test] async fn docker_gateway_restart_resumes_running_sandbox() { diff --git a/e2e/rust/tests/gateway_resume_expired_token.rs b/e2e/rust/tests/gateway_resume_expired_token.rs new file mode 100644 index 000000000..5fe55e3e9 --- /dev/null +++ b/e2e/rust/tests/gateway_resume_expired_token.rs @@ -0,0 +1,105 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#![cfg(feature = "e2e")] + +//! E2E coverage for Docker startup resume after a sandbox JWT expires while +//! its container is stopped during a gateway restart. + +use std::time::Duration; + +use openshell_e2e::harness::cli::{ + sandbox_names, wait_for_healthy, wait_for_sandbox_exec_contains, +}; +use openshell_e2e::harness::docker::wait_for_container_running; +use openshell_e2e::harness::gateway::ManagedGateway; +use openshell_e2e::harness::sandbox::SandboxGuard; +use tokio::time::sleep; + +const READY_MARKER: &str = "gateway-resume-expired-token-ready"; +const RESUME_FILE: &str = "/sandbox/gateway-resume-expired-token-state"; + +fn short_sandbox_jwt_ttl_secs() -> Option { + std::env::var("OPENSHELL_E2E_SANDBOX_JWT_TTL_SECS") + .ok() + .and_then(|value| value.parse::().ok()) + .filter(|ttl| (1..=10).contains(ttl)) +} + +#[tokio::test] +async fn docker_gateway_restart_reissues_expired_sandbox_token_before_resume() { + let Some(ttl_secs) = short_sandbox_jwt_ttl_secs() else { + eprintln!( + "Skipping expired-token gateway resume test: set OPENSHELL_E2E_SANDBOX_JWT_TTL_SECS to 1..=10" + ); + return; + }; + let Some(gateway) = ManagedGateway::from_env().expect("load managed e2e gateway metadata") + else { + eprintln!( + "Skipping expired-token gateway resume test: e2e gateway is not managed by this test run" + ); + return; + }; + let Some(namespace) = std::env::var("OPENSHELL_E2E_DOCKER_NETWORK_NAME") + .ok() + .filter(|value| !value.trim().is_empty()) + else { + eprintln!("Skipping expired-token gateway resume test: Docker e2e namespace is unavailable"); + return; + }; + + wait_for_healthy(Duration::from_secs(30)) + .await + .expect("gateway should start healthy"); + + let script = format!( + "echo before-token-expiry > {RESUME_FILE}; echo {READY_MARKER}; while true; do sleep 1; done" + ); + let mut sandbox = SandboxGuard::create_keep(&["sh", "-lc", &script], READY_MARKER) + .await + .expect("create long-running sandbox"); + + wait_for_sandbox_exec_contains( + &sandbox.name, + &["cat", RESUME_FILE], + "before-token-expiry", + Duration::from_secs(60), + ) + .await + .expect("sandbox should be ready before gateway stop"); + wait_for_container_running(&namespace, &sandbox.name, true, Duration::from_secs(60)) + .await + .expect("sandbox container should be running before gateway restart"); + + gateway.stop().expect("stop e2e gateway"); + wait_for_container_running(&namespace, &sandbox.name, false, Duration::from_secs(120)) + .await + .expect("gateway shutdown should stop managed Docker sandboxes"); + sleep(Duration::from_secs(ttl_secs.saturating_add(2).max(5))).await; + + gateway.start().expect("restart e2e gateway"); + wait_for_healthy(Duration::from_secs(120)) + .await + .expect("gateway should become healthy after restart"); + wait_for_container_running(&namespace, &sandbox.name, true, Duration::from_secs(120)) + .await + .expect("gateway startup should resume the Docker sandbox container"); + + let names = sandbox_names().await.expect("list sandboxes after restart"); + assert!( + names.contains(&sandbox.name), + "sandbox '{}' should still be listed after gateway restart. Names: {names:?}", + sandbox.name + ); + wait_for_sandbox_exec_contains( + &sandbox.name, + &["cat", RESUME_FILE], + "before-token-expiry", + Duration::from_secs(240), + ) + .await + .expect("sandbox should reconnect after startup resume with a reissued token"); + + sandbox.cleanup().await; +} diff --git a/e2e/rust/tests/podman_gateway_resume_expired_token.rs b/e2e/rust/tests/podman_gateway_resume_expired_token.rs new file mode 100644 index 000000000..9f5605360 --- /dev/null +++ b/e2e/rust/tests/podman_gateway_resume_expired_token.rs @@ -0,0 +1,89 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#![cfg(feature = "e2e-podman")] + +//! Podman e2e coverage for gateway-owned sandbox JWT file rotation. + +use std::time::Duration; + +use openshell_e2e::harness::cli::{sandbox_names, wait_for_healthy, wait_for_sandbox_exec_contains}; +use openshell_e2e::harness::gateway::ManagedGateway; +use openshell_e2e::harness::sandbox::SandboxGuard; +use tokio::time::sleep; + +const READY_MARKER: &str = "podman-gateway-resume-expired-token-ready"; +const RESUME_FILE: &str = "/sandbox/podman-gateway-resume-expired-token-state"; + +fn short_sandbox_jwt_ttl_secs() -> Option { + std::env::var("OPENSHELL_E2E_SANDBOX_JWT_TTL_SECS") + .ok() + .and_then(|value| value.parse::().ok()) + .filter(|ttl| (1..=10).contains(ttl)) +} + +#[tokio::test] +async fn podman_gateway_restart_reissues_expired_sandbox_token_for_running_container() { + if std::env::var("OPENSHELL_E2E_DRIVER").as_deref() != Ok("podman") { + eprintln!("Skipping Podman expired-token resume test: e2e driver is not podman"); + return; + } + let Some(ttl_secs) = short_sandbox_jwt_ttl_secs() else { + eprintln!( + "Skipping Podman expired-token resume test: set OPENSHELL_E2E_SANDBOX_JWT_TTL_SECS to 1..=10" + ); + return; + }; + let Some(gateway) = ManagedGateway::from_env().expect("load managed e2e gateway metadata") + else { + eprintln!( + "Skipping Podman expired-token resume test: e2e gateway is not managed by this test run" + ); + return; + }; + + wait_for_healthy(Duration::from_secs(30)) + .await + .expect("gateway should start healthy"); + + let script = format!( + "echo before-token-expiry > {RESUME_FILE}; echo {READY_MARKER}; while true; do sleep 1; done" + ); + let mut sandbox = SandboxGuard::create_keep(&["sh", "-lc", &script], READY_MARKER) + .await + .expect("create long-running Podman sandbox"); + + wait_for_sandbox_exec_contains( + &sandbox.name, + &["cat", RESUME_FILE], + "before-token-expiry", + Duration::from_secs(60), + ) + .await + .expect("Podman sandbox should be ready before gateway stop"); + + gateway.stop().expect("stop e2e gateway"); + sleep(Duration::from_secs(ttl_secs.saturating_add(2).max(5))).await; + + gateway.start().expect("restart e2e gateway"); + wait_for_healthy(Duration::from_secs(120)) + .await + .expect("gateway should become healthy after restart"); + + let names = sandbox_names().await.expect("list sandboxes after restart"); + assert!( + names.contains(&sandbox.name), + "sandbox '{}' should still be listed after gateway restart. Names: {names:?}", + sandbox.name + ); + wait_for_sandbox_exec_contains( + &sandbox.name, + &["cat", RESUME_FILE], + "before-token-expiry", + Duration::from_secs(240), + ) + .await + .expect("Podman sandbox should reconnect after startup reissued the token file"); + + sandbox.cleanup().await; +} diff --git a/e2e/rust/tests/vm_gateway_resume_expired_token.rs b/e2e/rust/tests/vm_gateway_resume_expired_token.rs new file mode 100644 index 000000000..85e8733f9 --- /dev/null +++ b/e2e/rust/tests/vm_gateway_resume_expired_token.rs @@ -0,0 +1,89 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +#![cfg(feature = "e2e-vm")] + +//! VM e2e coverage for gateway-owned sandbox JWT reissue before startup resume. + +use std::time::Duration; + +use openshell_e2e::harness::cli::{sandbox_names, wait_for_healthy, wait_for_sandbox_exec_contains}; +use openshell_e2e::harness::gateway::ManagedGateway; +use openshell_e2e::harness::sandbox::SandboxGuard; +use tokio::time::sleep; + +const READY_MARKER: &str = "vm-gateway-resume-expired-token-ready"; +const RESUME_FILE: &str = "/sandbox/vm-gateway-resume-expired-token-state"; + +fn short_sandbox_jwt_ttl_secs() -> Option { + std::env::var("OPENSHELL_E2E_SANDBOX_JWT_TTL_SECS") + .ok() + .and_then(|value| value.parse::().ok()) + .filter(|ttl| (1..=10).contains(ttl)) +} + +#[tokio::test] +async fn vm_gateway_restart_reissues_expired_sandbox_token_before_resume() { + if std::env::var("OPENSHELL_E2E_DRIVER").as_deref() != Ok("vm") { + eprintln!("Skipping VM expired-token resume test: e2e driver is not vm"); + return; + } + let Some(ttl_secs) = short_sandbox_jwt_ttl_secs() else { + eprintln!( + "Skipping VM expired-token resume test: set OPENSHELL_E2E_SANDBOX_JWT_TTL_SECS to 1..=10" + ); + return; + }; + let Some(gateway) = ManagedGateway::from_env().expect("load managed e2e gateway metadata") + else { + eprintln!( + "Skipping VM expired-token resume test: e2e gateway is not managed by this test run" + ); + return; + }; + + wait_for_healthy(Duration::from_secs(30)) + .await + .expect("gateway should start healthy"); + + let script = format!( + "echo before-token-expiry > {RESUME_FILE}; echo {READY_MARKER}; while true; do sleep 1; done" + ); + let mut sandbox = SandboxGuard::create_keep(&["sh", "-lc", &script], READY_MARKER) + .await + .expect("create long-running VM sandbox"); + + wait_for_sandbox_exec_contains( + &sandbox.name, + &["cat", RESUME_FILE], + "before-token-expiry", + Duration::from_secs(240), + ) + .await + .expect("VM sandbox should be ready before gateway stop"); + + gateway.stop().expect("stop e2e gateway"); + sleep(Duration::from_secs(ttl_secs.saturating_add(2).max(5))).await; + + gateway.start().expect("restart e2e gateway"); + wait_for_healthy(Duration::from_secs(120)) + .await + .expect("gateway should become healthy after restart"); + + let names = sandbox_names().await.expect("list sandboxes after restart"); + assert!( + names.contains(&sandbox.name), + "sandbox '{}' should still be listed after gateway restart. Names: {names:?}", + sandbox.name + ); + wait_for_sandbox_exec_contains( + &sandbox.name, + &["cat", RESUME_FILE], + "before-token-expiry", + Duration::from_secs(240), + ) + .await + .expect("VM sandbox should reconnect after startup resume with a reissued token"); + + sandbox.cleanup().await; +} diff --git a/e2e/support/gateway-common.sh b/e2e/support/gateway-common.sh index 9766126d5..2e48baacd 100644 --- a/e2e/support/gateway-common.sh +++ b/e2e/support/gateway-common.sh @@ -147,13 +147,25 @@ e2e_generate_gateway_jwt() { e2e_write_gateway_jwt_config() { local jwt_dir=$1 local gateway_id=$2 + local ttl_secs="${OPENSHELL_E2E_SANDBOX_JWT_TTL_SECS:-3600}" + + case "${ttl_secs}" in + ''|*[!0-9]*) + echo "ERROR: OPENSHELL_E2E_SANDBOX_JWT_TTL_SECS must be a positive integer, got '${ttl_secs}'" >&2 + return 1 + ;; + esac + if [ "${ttl_secs}" -lt 1 ]; then + echo "ERROR: OPENSHELL_E2E_SANDBOX_JWT_TTL_SECS must be >= 1" >&2 + return 1 + fi printf '[openshell.gateway.gateway_jwt]\n' printf 'signing_key_path = %s\n' "$(e2e_toml_string "${jwt_dir}/signing.pem")" printf 'public_key_path = %s\n' "$(e2e_toml_string "${jwt_dir}/public.pem")" printf 'kid_path = %s\n' "$(e2e_toml_string "${jwt_dir}/kid")" printf 'gateway_id = %s\n' "$(e2e_toml_string "${gateway_id}")" - printf 'ttl_secs = 3600\n\n' + printf 'ttl_secs = %s\n\n' "${ttl_secs}" } e2e_write_gateway_mtls_auth_config() { diff --git a/proto/compute_driver.proto b/proto/compute_driver.proto index 6de13f3e5..76b9895f0 100644 --- a/proto/compute_driver.proto +++ b/proto/compute_driver.proto @@ -32,6 +32,14 @@ service ComputeDriver { // Provision platform resources for a sandbox. rpc CreateSandbox(CreateSandboxRequest) returns (CreateSandboxResponse); + // Resume previously provisioned platform resources without resetting + // driver-owned persistent state. + rpc ResumeSandbox(ResumeSandboxRequest) returns (ResumeSandboxResponse); + + // Write a gateway-managed sandbox token into driver-owned persistent state. + rpc WriteSandboxToken(WriteSandboxTokenRequest) + returns (WriteSandboxTokenResponse); + // Stop platform resources for a sandbox without deleting its record. rpc StopSandbox(StopSandboxRequest) returns (StopSandboxResponse); @@ -92,9 +100,10 @@ message DriverSandboxSpec { string gpu_device = 10; // Gateway-minted JWT identifying this sandbox to the gateway. Set by // the gateway on create; the driver materialises it via its native - // secret mechanism (Docker/Podman/VM bind-mount a per-sandbox file; - // the Kubernetes driver ignores this field and relies on its projected - // ServiceAccount token bootstrap instead). Never echoed to the public + // secret mechanism (Docker/Podman bind-mount a per-sandbox file; VM + // injects the file into the guest overlay; the Kubernetes driver ignores + // this field and relies on its projected ServiceAccount token bootstrap + // instead). Never echoed to the public // Sandbox proto. string sandbox_token = 11; } @@ -226,6 +235,31 @@ message CreateSandboxRequest { message CreateSandboxResponse {} +message ResumeSandboxRequest { + // Stable sandbox ID stored by the gateway. + string sandbox_id = 1; + // Compute-runtime name used by the driver. + string sandbox_name = 2; + // Fresh gateway-minted JWT for the sandbox. Empty when per-sandbox auth is + // disabled. + string sandbox_token = 3; +} + +message ResumeSandboxResponse { + // True when the driver found persisted platform state and resumed it, or the + // sandbox was already running. + bool resumed = 1; +} + +message WriteSandboxTokenRequest { + // Stable sandbox ID stored by the gateway. + string sandbox_id = 1; + // Fresh gateway-minted JWT for the sandbox. + string sandbox_token = 2; +} + +message WriteSandboxTokenResponse {} + message StopSandboxRequest { // Stable sandbox ID stored by the gateway. string sandbox_id = 1; diff --git a/proto/openshell.proto b/proto/openshell.proto index 90d1594f7..521452a98 100644 --- a/proto/openshell.proto +++ b/proto/openshell.proto @@ -227,17 +227,16 @@ service OpenShell { // Exchange a sandbox-bootstrap credential (e.g. a Kubernetes projected // ServiceAccount token) for a gateway-minted JWT bound to the calling - // sandbox's UUID. Used by the Kubernetes driver path; singleplayer - // drivers receive the gateway JWT directly from the create-sandbox flow - // and never call this RPC. + // sandbox's UUID. Used by the Kubernetes driver path for bootstrap and + // re-exchange; singleplayer drivers receive gateway JWTs directly from + // the gateway-managed create/resume/refresh flow and never call this RPC. rpc IssueSandboxToken(IssueSandboxTokenRequest) returns (IssueSandboxTokenResponse); - // Renew the calling sandbox's gateway JWT. Older tokens remain valid - // until their own expiry; deployments should keep token TTLs short to - // bound replay exposure. The supervisor calls this from a background - // task at ~80% of the token's lifetime; the new token is cached in - // memory only — the on-disk bootstrap file is intentionally not - // rewritten. + // Renew the calling sandbox's gateway JWT. Older tokens remain valid until + // their own expiry; deployments should keep token TTLs short to bound replay + // exposure. Current supervisors use explicit auth modes instead: local + // runtimes are gateway-managed, and Kubernetes supervisors re-exchange + // ServiceAccount identity through IssueSandboxToken. rpc RefreshSandboxToken(RefreshSandboxTokenRequest) returns (RefreshSandboxTokenResponse); } @@ -257,13 +256,13 @@ message IssueSandboxTokenResponse { int64 expires_at_ms = 2; } -// RefreshSandboxToken request. Empty body; the calling principal must -// already be a sandbox principal (i.e. the request carries a still-valid +// RefreshSandboxToken request. Empty body; the calling principal must already +// be a sandbox principal (i.e. the request carries a still-valid // gateway-minted JWT in its Authorization header). message RefreshSandboxTokenRequest {} -// RefreshSandboxToken response. The new token replaces the supervisor's -// in-memory bearer credential. +// RefreshSandboxToken response. Retained for direct/manual callers that still +// refresh an already-valid sandbox JWT. message RefreshSandboxTokenResponse { // Fresh gateway-minted JWT bound to the same sandbox UUID. string token = 1; @@ -1316,6 +1315,7 @@ message GatewayMessage { GatewayHeartbeat heartbeat = 3; RelayOpen relay_open = 4; RelayClose relay_close = 5; + SandboxTokenUpdate sandbox_token_update = 6; } } @@ -1347,6 +1347,16 @@ message SupervisorHeartbeat {} // Gateway heartbeat. message GatewayHeartbeat {} +// Gateway-managed sandbox JWT rotation for supervisors running in +// gateway-managed-supervisor-push mode. The supervisor persists this token for +// subsequent outbound RPCs and reconnects. +message SandboxTokenUpdate { + // Fresh gateway-minted JWT for this sandbox. + string token = 1; + // Absolute expiry of the token, milliseconds since the epoch. + int64 expires_at_ms = 2; +} + // Gateway requests the supervisor to open a relay channel. // // On receiving this, the supervisor should initiate a RelayStream RPC to