From d0a40d1ca9c17ec57a7faaa7b98b63d0982f617f Mon Sep 17 00:00:00 2001
From: Jing Chen
Date: Tue, 23 Jun 2026 22:01:15 +0000
Subject: [PATCH] Make SandboxConfig SHA256 optional to support moving runsc version.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The default gvisor SandboxConfig now points at the "latest" release URL
(gs://gvisor/releases/release/latest/{arch}/runsc) instead of a pinned
nightly build. Because the binary behind "latest" changes with each
gVisor release, the SHA256 field can no longer be hardcoded in the
manifest.

When SHA256 is omitted from a SandboxConfig asset, atelet downloads the
binary and computes the hash on the fly. An in-memory URL->hash cache on
AteomHerder prevents redundant downloads within the same atelet
lifetime; restarting atelet picks up whatever "latest" currently
resolves to.

The computed hash is written back into the sandboxAssetsRecord before
persisting, so checkpoint/restore manifests remain pinned to the exact
binary that created the snapshot — the moving-target semantics apply
only to new Run requests, never to restores.
--- cmd/atelet/main.go | 5 + cmd/atelet/main_test.go | 66 +++++++- cmd/atelet/sandbox_assets.go | 154 +++++++++++++++--- internal/resources/validate.go | 3 + internal/resources/validate_test.go | 2 +- .../ate-install/sandboxconfig-gvisor.yaml | 6 +- pkg/api/v1alpha1/sandboxconfig_types.go | 8 +- .../v1alpha1/sandboxconfig_validation_test.go | 7 +- 8 files changed, 205 insertions(+), 46 deletions(-) diff --git a/cmd/atelet/main.go b/cmd/atelet/main.go index 7094f7315..0899a675c 100644 --- a/cmd/atelet/main.go +++ b/cmd/atelet/main.go @@ -26,6 +26,7 @@ import ( "path/filepath" "strconv" "strings" + "sync" "cloud.google.com/go/storage" "github.com/agent-substrate/substrate/cmd/atelet/internal/ategcs" @@ -187,6 +188,9 @@ type AteomHerder struct { pullCache *memorypullcache.MemoryPullCache anonGCSClient ategcs.ObjectStorage gcsClient ategcs.ObjectStorage + + urlHashMu sync.Mutex + urlHashCache map[string]string } var _ ateletpb.AteomHerderServer = (*AteomHerder)(nil) @@ -204,6 +208,7 @@ func NewService( pullCache: pullCache, anonGCSClient: anonGCSClient, gcsClient: gcsClient, + urlHashCache: make(map[string]string), } return wms } diff --git a/cmd/atelet/main_test.go b/cmd/atelet/main_test.go index 6be78fa51..db535346b 100644 --- a/cmd/atelet/main_test.go +++ b/cmd/atelet/main_test.go @@ -259,8 +259,8 @@ func TestFetchAssetRejectsBadHash(t *testing.T) { t.Fatalf("planting cache file: %v", err) } - s := &AteomHerder{} - if _, err := s.fetchAsset(context.Background(), assetEntry{SHA256: badHash}); err == nil { + s := &AteomHerder{urlHashCache: make(map[string]string)} + if _, _, err := s.fetchAsset(context.Background(), assetEntry{SHA256: badHash}); err == nil { t.Error("fetchAsset returned a cache hit for an invalid hash; validation must run before the os.Stat early return") } } @@ -292,11 +292,14 @@ func TestFetchAssetStreaming(t *testing.T) { t.Run("good asset is cached", func(t *testing.T) { ateompath.StaticFilesDir = t.TempDir() - s := &AteomHerder{anonGCSClient: 
fakeObjectStorage{data: content}} - path, err := s.fetchAsset(context.Background(), assetEntry{URL: url, SHA256: goodHash}) + s := &AteomHerder{anonGCSClient: fakeObjectStorage{data: content}, urlHashCache: make(map[string]string)} + path, resolvedHash, err := s.fetchAsset(context.Background(), assetEntry{URL: url, SHA256: goodHash}) if err != nil { t.Fatalf("fetchAsset: %v", err) } + if resolvedHash != goodHash { + t.Errorf("resolvedHash = %q, want %q", resolvedHash, goodHash) + } got, err := os.ReadFile(path) if err != nil { t.Fatalf("reading cached asset: %v", err) @@ -309,8 +312,8 @@ func TestFetchAssetStreaming(t *testing.T) { t.Run("over-cap asset rejected, cache not written", func(t *testing.T) { ateompath.StaticFilesDir = t.TempDir() maxAssetBytes = 4 // content is longer than this - s := &AteomHerder{anonGCSClient: fakeObjectStorage{data: content}} - if _, err := s.fetchAsset(context.Background(), assetEntry{URL: url, SHA256: goodHash}); err == nil { + s := &AteomHerder{anonGCSClient: fakeObjectStorage{data: content}, urlHashCache: make(map[string]string)} + if _, _, err := s.fetchAsset(context.Background(), assetEntry{URL: url, SHA256: goodHash}); err == nil { t.Fatal("fetchAsset accepted an over-cap asset") } if _, err := os.Stat(ateompath.RunSCBinaryPath(goodHash)); !errors.Is(err, os.ErrNotExist) { @@ -322,14 +325,61 @@ func TestFetchAssetStreaming(t *testing.T) { ateompath.StaticFilesDir = t.TempDir() maxAssetBytes = origCap wrongHash := strings.Repeat("a", 64) // valid 64-hex format, wrong value - s := &AteomHerder{anonGCSClient: fakeObjectStorage{data: content}} - if _, err := s.fetchAsset(context.Background(), assetEntry{URL: url, SHA256: wrongHash}); err == nil { + s := &AteomHerder{anonGCSClient: fakeObjectStorage{data: content}, urlHashCache: make(map[string]string)} + if _, _, err := s.fetchAsset(context.Background(), assetEntry{URL: url, SHA256: wrongHash}); err == nil { t.Fatal("fetchAsset accepted a hash mismatch") } if _, err := 
os.Stat(ateompath.RunSCBinaryPath(wrongHash)); !errors.Is(err, os.ErrNotExist) { t.Errorf("mismatched download left a file at the cache path (stat err = %v)", err) } }) + + t.Run("empty sha256 downloads and computes hash", func(t *testing.T) { + ateompath.StaticFilesDir = t.TempDir() + maxAssetBytes = origCap + s := &AteomHerder{anonGCSClient: fakeObjectStorage{data: content}, urlHashCache: make(map[string]string)} + path, resolvedHash, err := s.fetchAsset(context.Background(), assetEntry{URL: url, SHA256: ""}) + if err != nil { + t.Fatalf("fetchAsset: %v", err) + } + if resolvedHash != goodHash { + t.Errorf("resolvedHash = %q, want %q", resolvedHash, goodHash) + } + got, err := os.ReadFile(path) + if err != nil { + t.Fatalf("reading cached asset: %v", err) + } + if !bytes.Equal(got, content) { + t.Errorf("cached bytes = %q, want %q", got, content) + } + }) + + t.Run("empty sha256 uses in-memory cache on second call", func(t *testing.T) { + ateompath.StaticFilesDir = t.TempDir() + maxAssetBytes = origCap + s := &AteomHerder{anonGCSClient: fakeObjectStorage{data: content}, urlHashCache: make(map[string]string)} + _, _, err := s.fetchAsset(context.Background(), assetEntry{URL: url, SHA256: ""}) + if err != nil { + t.Fatalf("first fetchAsset: %v", err) + } + // Replace the GCS client with one that errors — the in-memory cache + // should prevent a second download. 
+ s.anonGCSClient = fakeObjectStorage{err: fmt.Errorf("should not be called")} + path2, resolvedHash2, err := s.fetchAsset(context.Background(), assetEntry{URL: url, SHA256: ""}) + if err != nil { + t.Fatalf("second fetchAsset should hit cache: %v", err) + } + if resolvedHash2 != goodHash { + t.Errorf("resolvedHash = %q, want %q", resolvedHash2, goodHash) + } + got, err := os.ReadFile(path2) + if err != nil { + t.Fatalf("reading cached asset: %v", err) + } + if !bytes.Equal(got, content) { + t.Errorf("cached bytes = %q, want %q", got, content) + } + }) } // TestRPCBoundariesReject confirms each of the three RPCs validates path inputs diff --git a/cmd/atelet/sandbox_assets.go b/cmd/atelet/sandbox_assets.go index 876a4b9f0..5d31cbf4b 100644 --- a/cmd/atelet/sandbox_assets.go +++ b/cmd/atelet/sandbox_assets.go @@ -86,81 +86,183 @@ func recordFromRequest(sa *ateletpb.SandboxAssets) (*sandboxAssetsRecord, error) // local path. For gVisor this is the single "runsc" asset, passed to ateom as // RunscPath. Binaries are content-addressed and cached, so re-fetching at // Checkpoint/Restore is a no-op once present. +// +// When the asset's SHA256 is empty (the SandboxConfig omitted it), the binary +// is downloaded and hashed on the fly; the resolved hash is written back into +// rec so that writeSandboxRecord persists the real hash for checkpoint/restore. func (s *AteomHerder) ensureSandboxBinary(ctx context.Context, rec *sandboxAssetsRecord) (string, error) { if err := os.MkdirAll(ateompath.StaticFilesDir, 0o700); err != nil { return "", fmt.Errorf("while creating static files dir: %w", err) } - // gVisor uses a single "runsc" asset. 
entry, ok := rec.Assets["runsc"] if !ok { return "", status.Errorf(codes.InvalidArgument, "sandbox assets for class %q missing required %q file", rec.SandboxClass, "runsc") } - return s.fetchAsset(ctx, entry) + path, resolvedHash, err := s.fetchAsset(ctx, entry) + if err != nil { + return "", err + } + if entry.SHA256 != resolvedHash { + entry.SHA256 = resolvedHash + rec.Assets["runsc"] = entry + } + return path, nil } -// fetchAsset downloads one content-addressed asset (verifying its sha256) into -// the shared static-files cache and returns its local path. On a cache hit it -// returns immediately. -func (s *AteomHerder) fetchAsset(ctx context.Context, entry assetEntry) (string, error) { +// fetchAsset downloads one content-addressed asset into the shared static-files +// cache and returns its local path and resolved SHA256. When entry.SHA256 is +// provided, the download is verified against the expected hash. When empty, the +// hash is computed on the fly and an in-memory URL→hash cache avoids redundant +// downloads within the same atelet process lifetime. +func (s *AteomHerder) fetchAsset(ctx context.Context, entry assetEntry) (string, string, error) { if err := resources.ValidateRunscHash(entry.SHA256); err != nil { - return "", status.Error(codes.InvalidArgument, err.Error()) + return "", "", status.Error(codes.InvalidArgument, err.Error()) } + if entry.SHA256 != "" { + return s.fetchAssetPinned(ctx, entry) + } + return s.fetchAssetUnpinned(ctx, entry) +} + +// fetchAssetPinned handles the case where the expected SHA256 is known: check +// the disk cache, download on miss, and verify the hash. 
+func (s *AteomHerder) fetchAssetPinned(ctx context.Context, entry assetEntry) (string, string, error) { localPath := ateompath.RunSCBinaryPath(entry.SHA256) _, err := os.Stat(localPath) - if err == nil { // EQUALS nil - return localPath, nil + if err == nil { + return localPath, entry.SHA256, nil } else if !errors.Is(err, os.ErrNotExist) { - return "", fmt.Errorf("while stat-ing local file: %w", err) + return "", "", fmt.Errorf("while stat-ing local file: %w", err) } - // gVisor's runsc lives in the public gs://gvisor bucket, so the anonymous - // client suffices. TODO: drive authenticated asset fetches from atelet - // configuration for assets in private buckets. - rc, err := ategcs.Open(ctx, s.anonGCSClient, entry.URL) + wantSum, err := hex.DecodeString(entry.SHA256) + if err != nil { + return "", "", fmt.Errorf("while parsing sha256 hash: %w", err) + } + + gotHash, err := s.downloadAsset(ctx, entry.URL, localPath, wantSum) if err != nil { - return "", fmt.Errorf("while fetching %v: %w", entry.URL, err) + return "", "", err + } + return localPath, gotHash, nil +} + +// fetchAssetUnpinned handles the case where no SHA256 was provided: consult the +// in-memory URL→hash cache first, then download and compute the hash on the fly. +func (s *AteomHerder) fetchAssetUnpinned(ctx context.Context, entry assetEntry) (string, string, error) { + s.urlHashMu.Lock() + cachedHash := s.urlHashCache[entry.URL] + s.urlHashMu.Unlock() + + if cachedHash != "" { + localPath := ateompath.RunSCBinaryPath(cachedHash) + if _, err := os.Stat(localPath); err == nil { + return localPath, cachedHash, nil + } + } + + localPath, computedHash, err := s.downloadAndCache(ctx, entry.URL) + if err != nil { + return "", "", err + } + + s.urlHashMu.Lock() + s.urlHashCache[entry.URL] = computedHash + s.urlHashMu.Unlock() + + return localPath, computedHash, nil +} + +// downloadAndCache downloads an asset to a temp file while computing its SHA256, +// then places it in the content-addressed cache. 
If a file with the computed +// hash already exists on disk, the download is discarded and the existing file +// is returned. +func (s *AteomHerder) downloadAndCache(ctx context.Context, url string) (string, string, error) { + rc, err := ategcs.Open(ctx, s.anonGCSClient, url) + if err != nil { + return "", "", fmt.Errorf("while fetching %v: %w", url, err) } defer rc.Close() - wantSum, err := hex.DecodeString(entry.SHA256) + tmpFile, err := os.CreateTemp(ateompath.StaticFilesDir, "runsc-download-") if err != nil { - return "", fmt.Errorf("while parsing sha256 hash: %w", err) + return "", "", fmt.Errorf("while creating temp file: %w", err) } + tmpName := tmpFile.Name() + defer os.Remove(tmpName) + defer tmpFile.Close() + + hasher := sha256.New() + n, err := io.Copy(io.MultiWriter(tmpFile, hasher), io.LimitReader(rc, maxAssetBytes+1)) + if err != nil { + return "", "", fmt.Errorf("while downloading %v: %w", url, err) + } + if n > maxAssetBytes { + return "", "", fmt.Errorf("asset %v exceeds %d-byte cap", url, maxAssetBytes) + } + + computedHash := hex.EncodeToString(hasher.Sum(nil)) + localPath := ateompath.RunSCBinaryPath(computedHash) + + if _, err := os.Stat(localPath); err == nil { + return localPath, computedHash, nil + } + + if err := tmpFile.Chmod(0o755); err != nil { + return "", "", fmt.Errorf("while setting file mode: %w", err) + } + if err := tmpFile.Close(); err != nil { + return "", "", fmt.Errorf("while closing temp file: %w", err) + } + if err := os.Rename(tmpName, localPath); err != nil { + return "", "", fmt.Errorf("while renaming temp file to target: %w", err) + } + + return localPath, computedHash, nil +} + +// downloadAsset downloads a URL to localPath, verifying the content against +// wantSum. Returns the hex-encoded hash of the downloaded content. 
+func (s *AteomHerder) downloadAsset(ctx context.Context, url, localPath string, wantSum []byte) (string, error) { + rc, err := ategcs.Open(ctx, s.anonGCSClient, url) + if err != nil { + return "", fmt.Errorf("while fetching %v: %w", url, err) + } + defer rc.Close() tmpFile, err := os.CreateTemp(filepath.Dir(localPath), filepath.Base(localPath)+"-download-") if err != nil { return "", fmt.Errorf("while creating temp file: %w", err) } tmpName := tmpFile.Name() - defer os.Remove(tmpName) // partial-download cleanup; no-op after rename + defer os.Remove(tmpName) defer tmpFile.Close() - // Stream to disk, hashing as we go; +1 lets an over-cap asset trip n > cap. - // Verify-after-copy keeps a bad download at the temp path, never the cache. hasher := sha256.New() n, err := io.Copy(io.MultiWriter(tmpFile, hasher), io.LimitReader(rc, maxAssetBytes+1)) if err != nil { - return "", fmt.Errorf("while downloading %v: %w", entry.URL, err) + return "", fmt.Errorf("while downloading %v: %w", url, err) } if n > maxAssetBytes { - return "", fmt.Errorf("asset %v exceeds %d-byte cap", entry.URL, maxAssetBytes) + return "", fmt.Errorf("asset %v exceeds %d-byte cap", url, maxAssetBytes) } - if got := hasher.Sum(nil); !bytes.Equal(got, wantSum) { - return "", fmt.Errorf("sha256 mismatch; got=%x want=%s", got, entry.SHA256) + got := hasher.Sum(nil) + if !bytes.Equal(got, wantSum) { + return "", fmt.Errorf("sha256 mismatch; got=%x want=%x", got, wantSum) } if err := tmpFile.Chmod(0o755); err != nil { return "", fmt.Errorf("while setting file mode: %w", err) } - if err := tmpFile.Close(); err != nil { // flush before rename + if err := tmpFile.Close(); err != nil { return "", fmt.Errorf("while closing temp file: %w", err) } if err := os.Rename(tmpName, localPath); err != nil { return "", fmt.Errorf("while renaming temp file to target: %w", err) } - return localPath, nil + return hex.EncodeToString(got), nil } // writeSandboxRecord persists the actor's running sandbox assets on-node so a 
diff --git a/internal/resources/validate.go b/internal/resources/validate.go index 3855fe856..477aa89f0 100644 --- a/internal/resources/validate.go +++ b/internal/resources/validate.go @@ -101,6 +101,9 @@ func ValidateContainerNames(names []string) error { // point the cache-hit early return (and the download target) at an arbitrary // binary outside the static-files dir. func ValidateRunscHash(sha256Hash string) error { + if sha256Hash == "" { + return nil + } if len(sha256Hash) != 64 { return fmt.Errorf("invalid runsc sha256 hash: want 64 hex chars, got %d", len(sha256Hash)) } diff --git a/internal/resources/validate_test.go b/internal/resources/validate_test.go index a525ae15f..200360e25 100644 --- a/internal/resources/validate_test.go +++ b/internal/resources/validate_test.go @@ -121,7 +121,7 @@ func TestValidateRunscHash(t *testing.T) { }{ {"valid lowercase", valid, false}, {"valid uppercase", strings.ToUpper(valid), false}, - {"empty", "", true}, + {"empty", "", false}, {"too short", "abc123", true}, {"too long", valid + "00", true}, {"separator", strings.Repeat("a", 60) + "/../", true}, diff --git a/manifests/ate-install/sandboxconfig-gvisor.yaml b/manifests/ate-install/sandboxconfig-gvisor.yaml index 62f310d71..7478471e1 100644 --- a/manifests/ate-install/sandboxconfig-gvisor.yaml +++ b/manifests/ate-install/sandboxconfig-gvisor.yaml @@ -27,9 +27,7 @@ spec: assets: amd64: runsc: - url: "gs://gvisor/releases/nightly/2026-05-19/x86_64/runsc" - sha256: "a397be1abc2420d26bce6c70e6e2ff96c73aaaab929756c56f5e2089ea842b63" + url: "gs://gvisor/releases/release/latest/x86_64/runsc" arm64: runsc: - url: "gs://gvisor/releases/nightly/2026-05-19/aarch64/runsc" - sha256: "1ba2366ae2efceba166046f51a4104f9261c9cb72c6db8f5b3fe2dc57dea86b9" + url: "gs://gvisor/releases/release/latest/aarch64/runsc" diff --git a/pkg/api/v1alpha1/sandboxconfig_types.go b/pkg/api/v1alpha1/sandboxconfig_types.go index fbaad908d..c1c201f4d 100644 --- a/pkg/api/v1alpha1/sandboxconfig_types.go +++ 
b/pkg/api/v1alpha1/sandboxconfig_types.go @@ -43,10 +43,12 @@ type AssetFile struct { // SHA256 is the lower-case hex SHA256 of the asset. It both names the cached // file (preventing collisions) and verifies the download's integrity. + // When omitted, atelet downloads the asset, computes the hash on the fly, + // and uses it for caching and the snapshot manifest. // - // +required - // +kubebuilder:validation:Pattern=`^[a-f0-9]{64}$` - SHA256 string `json:"sha256"` + // +optional + // +kubebuilder:validation:Pattern=`^([a-f0-9]{64})?$` + SHA256 string `json:"sha256,omitempty"` } // SandboxConfigSpec is the desired state of a SandboxConfig. diff --git a/pkg/api/v1alpha1/sandboxconfig_validation_test.go b/pkg/api/v1alpha1/sandboxconfig_validation_test.go index 632c55263..17d4adba5 100644 --- a/pkg/api/v1alpha1/sandboxconfig_validation_test.go +++ b/pkg/api/v1alpha1/sandboxconfig_validation_test.go @@ -127,10 +127,9 @@ func TestSandboxConfigValidation(t *testing.T) { wantErr: true, errMsg: "url", }, { - name: "asset missing sha256", - sc: sandboxConfig("bad-no-sha", SandboxClassGvisor, map[string]map[string]AssetFile{"amd64": {"runsc": {URL: "gs://bucket/runsc"}}}), - wantErr: true, - errMsg: "sha256", + name: "valid gvisor with runsc, no sha256", + sc: sandboxConfig("ok-no-sha", SandboxClassGvisor, map[string]map[string]AssetFile{"amd64": {"runsc": {URL: "gs://bucket/runsc"}}}), + wantErr: false, }, { name: "asset sha256 not 64 hex", sc: sandboxConfig("bad-sha", SandboxClassGvisor, map[string]map[string]AssetFile{"amd64": {"runsc": {URL: "gs://bucket/runsc", SHA256: "deadbeef"}}}),