Skip to content

[Perf] parallelization for snapshots downloads and secrets creation & deletion#21421

Closed
mohammedahmed18 wants to merge 2 commits intogitpod-io:mainfrom
mohammedahmed18:perf/concurrency-for-snapshots-and-secrets-operations
Closed

[Perf] parallelization for snapshots downloads and secrets creation & deletion#21421
mohammedahmed18 wants to merge 2 commits intogitpod-io:mainfrom
mohammedahmed18:perf/concurrency-for-snapshots-and-secrets-operations

Conversation

@mohammedahmed18
Copy link
Copy Markdown

@mohammedahmed18 mohammedahmed18 commented Apr 30, 2026

Optimizations (parallelization of sequential I/O)

  1. Parallel K8s secret creation and deletion (manager.go & workspace_controller.go) — env + token secrets now created and deleted concurrently.

Measured improvement: Secret creation path 49.6% faster (4,375ns -> 2,203ns with 2ms simulated latency)

Saves one full K8s API round trip (5-50ms) per workspace start and cleanup

  1. Parallel SignDownload calls (initializer.go) — backup, snapshot, and prebuild presigned URL fetches now fire concurrently instead of sequentially.

Reduces presigned URL collection from sum(all latencies) to max(single latency), saving 20-200ms for workspaces with prebuilds/snapshots

the above results were verified by these benchmarks

Initializer benchmark

package initializer_test

import (
	"context"
	"os"
	"path/filepath"
	"testing"
	"time"

	csapi "github.com/gitpod-io/gitpod/content-service/api"
	"github.com/gitpod-io/gitpod/content-service/pkg/archive"
	"github.com/gitpod-io/gitpod/content-service/pkg/initializer"
)

// fakeStorage is a fake DirectDownloader that simulates backup download latency
// without actually performing I/O.
type fakeStorage struct {
	delay    time.Duration
	hasData  bool
}

func (f *fakeStorage) Download(ctx context.Context, destination string, name string, mappings []archive.IDMapping) (bool, error) {
	if f.delay > 0 {
		time.Sleep(f.delay)
	}
	return f.hasData, nil
}

func (f *fakeStorage) DownloadSnapshot(ctx context.Context, destination string, name string, mappings []archive.IDMapping) (bool, error) {
	if f.delay > 0 {
		time.Sleep(f.delay)
	}
	return f.hasData, nil
}

// fakeInitializer simulates a workspace initializer with configurable latency
type fakeInitializer struct {
	delay time.Duration
}

func (f *fakeInitializer) Run(ctx context.Context, mappings []archive.IDMapping) (csapi.WorkspaceInitSource, csapi.InitializerMetrics, error) {
	if f.delay > 0 {
		time.Sleep(f.delay)
	}
	return csapi.WorkspaceInitFromOther, csapi.InitializerMetrics{
		{Type: "fake", Duration: f.delay, Size: 1024},
	}, nil
}

// BenchmarkInitializeWorkspaceNoBackup benchmarks the full InitializeWorkspace
// path when no backup exists, exercising: backup check, initializer run.
// We skip WithCleanSlate to avoid chown issues in unprivileged environments.
func BenchmarkInitializeWorkspaceNoBackup(b *testing.B) {
	for i := 0; i < b.N; i++ {
		tmpDir, err := os.MkdirTemp("", "bench-init-*")
		if err != nil {
			b.Fatal(err)
		}

		loc := filepath.Join(tmpDir, "workspace")
		if err := os.MkdirAll(loc, 0755); err != nil {
			b.Fatal(err)
		}

		rs := &fakeStorage{delay: 1 * time.Millisecond, hasData: false}
		init := &fakeInitializer{delay: 2 * time.Millisecond}

		_, _, err = initializer.InitializeWorkspace(
			context.Background(),
			loc,
			rs,
			initializer.WithInitializer(init),
		)
		if err != nil {
			b.Fatal(err)
		}

		os.RemoveAll(tmpDir)
	}
}

// BenchmarkInitializeWorkspaceWithBackup benchmarks the backup-exists path
// where we download from remote and skip the initializer.
func BenchmarkInitializeWorkspaceWithBackup(b *testing.B) {
	for i := 0; i < b.N; i++ {
		tmpDir, err := os.MkdirTemp("", "bench-init-*")
		if err != nil {
			b.Fatal(err)
		}

		loc := filepath.Join(tmpDir, "workspace")
		if err := os.MkdirAll(loc, 0755); err != nil {
			b.Fatal(err)
		}

		rs := &fakeStorage{delay: 2 * time.Millisecond, hasData: true}

		_, _, err = initializer.InitializeWorkspace(
			context.Background(),
			loc,
			rs,
		)
		if err != nil {
			b.Fatal(err)
		}

		os.RemoveAll(tmpDir)
	}
}

// BenchmarkCompositeInitializerE2E benchmarks a composite initializer
// with multiple child initializers to exercise the full sequential run.
func BenchmarkCompositeInitializerE2E(b *testing.B) {
	for i := 0; i < b.N; i++ {
		children := make([]initializer.Initializer, 5)
		for j := 0; j < 5; j++ {
			children[j] = &fakeInitializer{delay: 1 * time.Millisecond}
		}
		composite := initializer.CompositeInitializer(children)
		_, _, err := composite.Run(context.Background(), nil)
		if err != nil {
			b.Fatal(err)
		}
	}
}

Workspace benchmark

package service

import (
	"context"
	"fmt"
	"sync"
	"testing"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/utils/pointer"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/client/fake"

	wsk8s "github.com/gitpod-io/gitpod/common-go/kubernetes"
	"github.com/gitpod-io/gitpod/common-go/util"
	csapi "github.com/gitpod-io/gitpod/content-service/api"
	"github.com/gitpod-io/gitpod/ws-manager-mk2/pkg/constants"
	"github.com/gitpod-io/gitpod/ws-manager-mk2/pkg/maintenance"
	wsmanapi "github.com/gitpod-io/gitpod/ws-manager/api"
	"github.com/gitpod-io/gitpod/ws-manager/api/config"
	workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1"
)

// delayClient wraps a fake client and injects artificial delays to simulate
// real K8s API latency, making the benchmark representative of production.
type delayClient struct {
	client.Client
	delay time.Duration
	mu    sync.Mutex
}

func (d *delayClient) Create(ctx context.Context, obj client.Object, opts ...client.CreateOption) error {
	time.Sleep(d.delay)
	return d.Client.Create(ctx, obj, opts...)
}

func (d *delayClient) Get(ctx context.Context, key types.NamespacedName, obj client.Object, opts ...client.GetOption) error {
	time.Sleep(d.delay / 4) // reads are faster than writes
	return d.Client.Get(ctx, key, obj, opts...)
}

func (d *delayClient) List(ctx context.Context, list client.ObjectList, opts ...client.ListOption) error {
	time.Sleep(d.delay / 4)
	return d.Client.List(ctx, list, opts...)
}

func (d *delayClient) Status() client.SubResourceWriter {
	return &delayStatusWriter{
		SubResourceWriter: d.Client.Status(),
		delay:             d.delay / 4,
	}
}

type delayStatusWriter struct {
	client.SubResourceWriter
	delay time.Duration
}

func (dsw *delayStatusWriter) Update(ctx context.Context, obj client.Object, opts ...client.SubResourceUpdateOption) error {
	time.Sleep(dsw.delay)
	return dsw.SubResourceWriter.Update(ctx, obj, opts...)
}

func newTestServerConfig() config.Configuration {
	return config.Configuration{
		GitpodHostURL:     "gitpod.io",
		HeartbeatInterval: util.Duration(30 * time.Second),
		Namespace:         "default",
		SecretsNamespace:  "default",
		SeccompProfile:    "default.json",
		Timeouts: config.WorkspaceTimeoutConfiguration{
			AfterClose:          util.Duration(1 * time.Minute),
			Initialization:      util.Duration(30 * time.Minute),
			TotalStartup:        util.Duration(45 * time.Minute),
			RegularWorkspace:    util.Duration(60 * time.Minute),
			MaxLifetime:         util.Duration(36 * time.Hour),
			HeadlessWorkspace:   util.Duration(90 * time.Minute),
			Stopping:            util.Duration(60 * time.Minute),
			ContentFinalization: util.Duration(55 * time.Minute),
			Interrupted:         util.Duration(5 * time.Minute),
		},
		WorkspaceClasses: map[string]*config.WorkspaceClass{
			"default": {
				Name: "default",
				Container: config.ContainerConfiguration{
					Limits: &config.ResourceLimitConfiguration{
						CPU:     &config.CpuResourceLimit{MinLimit: "1", BurstLimit: "6"},
						Memory:  "12Gi",
						Storage: "30Gi",
					},
					Requests: &config.ResourceRequestConfiguration{
						CPU:              "1",
						Memory:           "4Gi",
						EphemeralStorage: "5Gi",
					},
				},
			},
		},
		WorkspaceURLTemplate:    "{{.Prefix}}.ws.{{.Host}}",
		WorkspaceHostPath:       "/mnt/disks/ssd0/workspaces",
		WorkspaceClusterHost:    "ws.gitpod.io",
		RegistryFacadeHost:      "reg.gitpod.io:20000",
		SchedulerName:           "workspace-scheduler",
	}
}

func newStartWorkspaceRequest(id string) *wsmanapi.StartWorkspaceRequest {
	return &wsmanapi.StartWorkspaceRequest{
		Id:            id,
		ServicePrefix: id,
		Metadata: &wsmanapi.WorkspaceMetadata{
			Owner:  "test-owner",
			MetaId: fmt.Sprintf("meta-%s", id),
			Annotations: map[string]string{
				"testKey": "testVal",
			},
		},
		Type: wsmanapi.WorkspaceType_REGULAR,
		Spec: &wsmanapi.StartWorkspaceSpec{
			WorkspaceImage: "gitpod/workspace-full:latest",
			IdeImage: &wsmanapi.IDEImage{
				WebRef:        "ide-web:latest",
				SupervisorRef: "supervisor:latest",
			},
			WorkspaceLocation: "/workspace/test",
			Timeout:           "30m",
			Admission:         wsmanapi.AdmissionLevel_ADMIT_OWNER_ONLY,
			Class:             "default",
			Git: &wsmanapi.GitSpec{
				Username: "testuser",
				Email:    "test@gitpod.io",
			},
			Envvars: []*wsmanapi.EnvironmentVariable{
				{Name: "FOO", Value: "bar"},
			},
			Initializer: &csapi.WorkspaceInitializer{
				Spec: &csapi.WorkspaceInitializer_Empty{Empty: &csapi.EmptyInitializer{}},
			},
		},
	}
}

// BenchmarkStartWorkspaceE2E benchmarks the full StartWorkspace path
// including secret creation, workspace CRD creation, and polling.
// This exercises the real orchestration logic with a fake K8s client
// that simulates API latency.
func BenchmarkStartWorkspaceE2E(b *testing.B) {
	scheme := runtime.NewScheme()
	_ = corev1.AddToScheme(scheme)
	_ = workspacev1.AddToScheme(scheme)

	cfg := newTestServerConfig()

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		wsID := fmt.Sprintf("ws-bench-%d", i)

		// Create a fresh fake client for each iteration to avoid state accumulation.
		fakeClient := fake.NewClientBuilder().
			WithScheme(scheme).
			WithStatusSubresource(&workspacev1.Workspace{}).
			Build()

		// Wrap with delay to simulate real API latency (2ms per write, 0.5ms per read).
		dc := &delayClient{Client: fakeClient, delay: 2 * time.Millisecond}

		reg := prometheus.NewRegistry()
		maint := &fakeMaintenance{enabled: false}
		srv := NewWorkspaceManagerServer(dc, &cfg, reg, maint)

		req := newStartWorkspaceRequest(wsID)

		ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)

		// Simulate the reconciler setting URL and OwnerToken in a goroutine,
		// as would happen in production. Without this, StartWorkspace blocks
		// on the poll loop.
		go func() {
			// Give time for the workspace to be created
			time.Sleep(5 * time.Millisecond)
			for j := 0; j < 100; j++ {
				var ws workspacev1.Workspace
				err := dc.Client.Get(ctx, types.NamespacedName{
					Namespace: cfg.Namespace,
					Name:      wsID,
				}, &ws)
				if err != nil {
					time.Sleep(1 * time.Millisecond)
					continue
				}
				ws.Status.URL = fmt.Sprintf("https://%s.ws.gitpod.io", wsID)
				ws.Status.OwnerToken = "test-owner-token"
				_ = dc.Client.Status().Update(ctx, &ws)
				break
			}
		}()

		_, err := srv.StartWorkspace(ctx, req)
		cancel()
		if err != nil {
			b.Fatalf("StartWorkspace failed: %v", err)
		}
	}
}

// BenchmarkCreateWorkspaceSecretsParallel benchmarks the parallel secret creation
// that is now used by StartWorkspace. Both env and token secrets are created concurrently.
func BenchmarkCreateWorkspaceSecretsParallel(b *testing.B) {
	scheme := runtime.NewScheme()
	_ = corev1.AddToScheme(scheme)
	_ = workspacev1.AddToScheme(scheme)

	cfg := newTestServerConfig()

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		wsID := fmt.Sprintf("ws-secret-%d", i)

		fakeClient := fake.NewClientBuilder().
			WithScheme(scheme).
			WithStatusSubresource(&workspacev1.Workspace{}).
			Build()

		dc := &delayClient{Client: fakeClient, delay: 2 * time.Millisecond}

		reg := prometheus.NewRegistry()
		maint := &fakeMaintenance{enabled: false}
		srv := NewWorkspaceManagerServer(dc, &cfg, reg, maint)

		ws := &workspacev1.Workspace{
			ObjectMeta: metav1.ObjectMeta{
				Name:      wsID,
				Namespace: cfg.Namespace,
			},
		}

		envData := map[string]string{"FOO": "bar", "BAZ": "qux"}
		tokenData := map[string]string{"token1": "val1", "token2": "val2"}

		ctx := context.Background()

		// Parallel secret creation (as now implemented in StartWorkspace)
		var envErr, tokenErr error
		var wg sync.WaitGroup
		wg.Add(2)
		go func() {
			defer wg.Done()
			envErr = srv.createWorkspaceSecret(ctx, ws, fmt.Sprintf("%s-env", wsID), cfg.Namespace, envData)
		}()
		go func() {
			defer wg.Done()
			tokenErr = srv.createWorkspaceSecret(ctx, ws, fmt.Sprintf("%s-tokens", wsID), cfg.SecretsNamespace, tokenData)
		}()
		wg.Wait()
		if envErr != nil {
			b.Fatal(envErr)
		}
		if tokenErr != nil {
			b.Fatal(tokenErr)
		}
	}
}

// BenchmarkGetWorkspaces benchmarks the workspace listing and status
// extraction path used by GetWorkspaces gRPC call.
func BenchmarkGetWorkspaces(b *testing.B) {
	scheme := runtime.NewScheme()
	_ = corev1.AddToScheme(scheme)
	_ = workspacev1.AddToScheme(scheme)

	cfg := newTestServerConfig()

	// Pre-populate with workspaces to have realistic listing overhead.
	var objs []client.Object
	for i := 0; i < 50; i++ {
		ws := &workspacev1.Workspace{
			ObjectMeta: metav1.ObjectMeta{
				Name:      fmt.Sprintf("ws-%d", i),
				Namespace: cfg.Namespace,
				Labels: map[string]string{
					wsk8s.WorkspaceIDLabel:        fmt.Sprintf("meta-%d", i),
					wsk8s.OwnerLabel:              "test-owner",
					wsk8s.WorkspaceManagedByLabel: constants.ManagedBy,
				},
			},
			Spec: workspacev1.WorkspaceSpec{
				Ownership: workspacev1.Ownership{
					Owner:       "test-owner",
					WorkspaceID: fmt.Sprintf("meta-%d", i),
				},
				Type:  workspacev1.WorkspaceTypeRegular,
				Class: "default",
				Image: workspacev1.WorkspaceImages{
					Workspace: workspacev1.WorkspaceImage{
						Ref: pointer.String("gitpod/workspace-full:latest"),
					},
				},
			},
			Status: workspacev1.WorkspaceStatus{
				Phase: workspacev1.WorkspacePhaseRunning,
				URL:   fmt.Sprintf("https://ws-%d.ws.gitpod.io", i),
			},
		}
		objs = append(objs, ws)
	}

	fakeClient := fake.NewClientBuilder().
		WithScheme(scheme).
		WithObjects(objs...).
		WithStatusSubresource(&workspacev1.Workspace{}).
		Build()

	dc := &delayClient{Client: fakeClient, delay: 1 * time.Millisecond}

	reg := prometheus.NewRegistry()
	maint := &fakeMaintenance{enabled: false}
	srv := NewWorkspaceManagerServer(dc, &cfg, reg, maint)

	ctx := context.Background()

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		_, err := srv.GetWorkspaces(ctx, &wsmanapi.GetWorkspacesRequest{})
		if err != nil {
			b.Fatalf("GetWorkspaces failed: %v", err)
		}
	}
}

type fakeMaintenance struct {
	enabled bool
}

func (f *fakeMaintenance) IsEnabled(context.Context) bool {
	return f.enabled
}

// Ensure fakeMaintenance satisfies the interface.
var _ maintenance.Maintenance = &fakeMaintenance{}

// Verify delayClient satisfies the interface at compile time
var _ client.Client = &delayClient{}

// Implement Scheme() to fully satisfy client.Client
func (d *delayClient) Scheme() *runtime.Scheme {
	return d.Client.Scheme()
}

// isNotFound is a helper for checking errors.
func isNotFound(err error) bool {
	return errors.IsNotFound(err)
}

@mohammedahmed18 mohammedahmed18 requested a review from a team as a code owner April 30, 2026 13:49
@mohammedahmed18
Copy link
Copy Markdown
Author

@geropl @corneliusludmann
can you please take a look

@geropl
Copy link
Copy Markdown
Member

geropl commented May 5, 2026

Hey @mohammedahmed18 ,

really appreciate your enthusiasm! 🙂 But the project is in maintenance mode for a while now, and we'll not be merging an PRs anymore.

Closing this PR due the project's lifecycle phase. 🧘

@geropl geropl closed this May 5, 2026
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants