diff --git a/cmd/containerd-shim-lcow-v2/containerd-shim-lcow-v2.exe.manifest b/cmd/containerd-shim-lcow-v2/containerd-shim-lcow-v2.exe.manifest
new file mode 100644
index 0000000000..9c5ba67277
--- /dev/null
+++ b/cmd/containerd-shim-lcow-v2/containerd-shim-lcow-v2.exe.manifest
@@ -0,0 +1,17 @@
+
+
+ containerd-shim-lcow-v2
+
+
+
+
+
+
+
+
+
+ true
+
+
+
+
diff --git a/cmd/containerd-shim-lcow-v2/main.go b/cmd/containerd-shim-lcow-v2/main.go
new file mode 100644
index 0000000000..9951144a8a
--- /dev/null
+++ b/cmd/containerd-shim-lcow-v2/main.go
@@ -0,0 +1,93 @@
+//go:build windows
+
+// containerd-shim-lcow-v2 is a containerd shim implementation for Linux Containers on Windows (LCOW).
+package main
+
+import (
+ "context"
+ "errors"
+ "fmt"
+ "io"
+ "os"
+
+ "github.com/Microsoft/hcsshim/cmd/containerd-shim-lcow-v2/service"
+ _ "github.com/Microsoft/hcsshim/cmd/containerd-shim-lcow-v2/service/plugin"
+ runhcsopts "github.com/Microsoft/hcsshim/cmd/containerd-shim-runhcs-v1/options"
+ "github.com/Microsoft/hcsshim/internal/log"
+ "github.com/Microsoft/hcsshim/internal/oc"
+ "github.com/Microsoft/hcsshim/internal/shim"
+
+ "github.com/containerd/errdefs"
+ "github.com/sirupsen/logrus"
+ "go.opencensus.io/trace"
+)
+
+// Add a manifest to get proper Windows version detection.
+//go:generate go tool github.com/josephspurrier/goversioninfo/cmd/goversioninfo -platform-specific
+
+func main() {
+ logrus.AddHook(log.NewHook())
+
+ // Register our OpenCensus logrus exporter so that trace spans are emitted via logrus.
+ trace.ApplyConfig(trace.Config{DefaultSampler: oc.DefaultSampler})
+ trace.RegisterExporter(&oc.LogrusExporter{})
+
+ logrus.SetFormatter(log.NopFormatter{})
+ logrus.SetOutput(io.Discard)
+
+ // Set the log configuration.
+ // If we encounter an error, we exit with non-zero code.
+ if err := setLogConfiguration(); err != nil {
+ _, _ = fmt.Fprintf(os.Stderr, "%s: %s", service.ShimName, err)
+ os.Exit(1)
+ }
+
+ // Start the shim manager event loop. The manager is responsible for
+ // handling containerd start/stop lifecycle calls for the shim process.
+ shim.Run(context.Background(), newShimManager(service.ShimName), func(c *shim.Config) {
+ // We don't want the shim package to set up logging options.
+ c.NoSetupLogger = true
+ })
+}
+
+// setLogConfiguration reads the runtime options from stdin and sets the log configuration.
+// We only set up the log configuration for serve action.
+func setLogConfiguration() error {
+ // We set up the log configuration in the serve action only.
+ // This is because we want to avoid reading the stdin in start action,
+ // so that we can pass it along to the invocation for serve action.
+ if len(os.Args) > 1 && os.Args[len(os.Args)-1] == "serve" {
+ // The serve process is started with stderr pointing to panic.log file.
+ // We want to keep that file only for pure Go panics. Any explicit writes
+ // to os.Stderr should go to stdout instead, which is connected to the parent's
+ // stderr for regular logging.
+ // We can safely redirect os.Stderr to os.Stdout because in case of panics,
+ // the Go runtime will write the panic stack trace directly to the file descriptor,
+ // bypassing os.Stderr, so it will still go to panic.log.
+ os.Stderr = os.Stdout
+
+ opts, err := shim.ReadRuntimeOptions[*runhcsopts.Options](os.Stdin)
+ if err != nil {
+ if !errors.Is(err, errdefs.ErrNotFound) {
+ return fmt.Errorf("failed to read runtime options from stdin: %w", err)
+ }
+ }
+
+ if opts != nil {
+ if opts.LogLevel != "" {
+ // If log level is specified, set the corresponding logrus logging level.
+ lvl, err := logrus.ParseLevel(opts.LogLevel)
+ if err != nil {
+ return fmt.Errorf("failed to parse shim log level %q: %w", opts.LogLevel, err)
+ }
+ logrus.SetLevel(lvl)
+ }
+
+ if opts.ScrubLogs {
+ log.SetScrubbing(true)
+ }
+ }
+ _ = os.Stdin.Close()
+ }
+ return nil
+}
diff --git a/cmd/containerd-shim-lcow-v2/manager.go b/cmd/containerd-shim-lcow-v2/manager.go
new file mode 100644
index 0000000000..e0a997dcc5
--- /dev/null
+++ b/cmd/containerd-shim-lcow-v2/manager.go
@@ -0,0 +1,295 @@
+//go:build windows
+
+package main
+
+import (
+ "context"
+ "errors"
+ "fmt"
+ "io"
+ "os"
+ "os/exec"
+ "path/filepath"
+ "syscall"
+ "time"
+
+ runhcsopts "github.com/Microsoft/hcsshim/cmd/containerd-shim-runhcs-v1/options"
+ "github.com/Microsoft/hcsshim/internal/hcs"
+ "github.com/Microsoft/hcsshim/internal/memory"
+ "github.com/Microsoft/hcsshim/internal/oc"
+ "github.com/Microsoft/hcsshim/internal/shim"
+ hcsversion "github.com/Microsoft/hcsshim/internal/version"
+
+ "github.com/containerd/containerd/api/types"
+ "github.com/containerd/containerd/v2/pkg/namespaces"
+ "github.com/containerd/errdefs"
+ "github.com/containerd/typeurl/v2"
+ "github.com/opencontainers/runtime-spec/specs-go"
+ "github.com/sirupsen/logrus"
+ "golang.org/x/sys/windows"
+)
+
const (
	// addrFmt is the format of the named-pipe address used to serve the shim's
	// ttrpc API. It is formatted with the namespace and the shim/sandbox ID.
	addrFmt = "\\\\.\\pipe\\ProtectedPrefix\\Administrators\\containerd-shim-%s-%s-pipe"

	// serveReadyEventNameFormat is the format string used to construct the named Windows event
	// that signals when the child "serve" process is ready to accept ttrpc connections.
	// It is formatted with the namespace and shim ID (e.g. "<namespace>-<id>").
	serveReadyEventNameFormat = "%s-%s"
)
+
// shimManager implements the shim.Manager interface. It is the entry-point
// used by the containerd shim runner to create and destroy shim instances.
type shimManager struct {
	// name is the shim binary name reported back to containerd (see Name).
	name string
}

// Verify that shimManager implements shim.Manager interface at compile time.
var _ shim.Manager = (*shimManager)(nil)

// newShimManager returns a shimManager with the given binary name.
func newShimManager(name string) *shimManager {
	return &shimManager{
		name: name,
	}
}
+
+// newCommand builds the exec.Cmd that will be used to spawn the long-running
+// "serve" child process.
+func newCommand(ctx context.Context,
+ id,
+ containerdAddress,
+ socketAddr string,
+ stderr io.Writer,
+) (*exec.Cmd, error) {
+ ns, err := namespaces.NamespaceRequired(ctx)
+ if err != nil {
+ return nil, err
+ }
+ self, err := os.Executable()
+ if err != nil {
+ return nil, err
+ }
+ cwd, err := os.Getwd()
+ if err != nil {
+ return nil, err
+ }
+
+ args := []string{
+ "-namespace", ns,
+ "-id", id,
+ "-address", containerdAddress,
+ "-socket", socketAddr,
+ "serve",
+ }
+ cmd := exec.Command(self, args...)
+ cmd.Dir = cwd
+ // Limit Go runtime parallelism in the child to avoid excessive CPU usage.
+ cmd.Env = append(os.Environ(), "GOMAXPROCS=4")
+ // Place the child in its own process group so OS signals (e.g. Ctrl-C)
+ // sent to the parent are not automatically forwarded to the child.
+ cmd.SysProcAttr = &syscall.SysProcAttr{
+ CreationFlags: windows.CREATE_NEW_PROCESS_GROUP,
+ }
+ cmd.Stdin = os.Stdin
+ cmd.Stdout = os.Stderr
+ cmd.Stderr = stderr
+
+ return cmd, nil
+}
+
// Name returns the name of the shim binary this manager represents.
// It implements shim.Manager.
func (m *shimManager) Name() string {
	return m.name
}
+
+// Start starts a shim instance for 'containerd-shim-lcow-v2'.
+// This shim relies on containerd's Sandbox API to start a sandbox.
+// There can be following scenarios that will launch a shim-
+//
+// 1. Containerd Sandbox Controller calls the Start command to start
+// the sandbox for the pod. All the container create requests will
+// set the SandboxID via `WithSandbox` ContainerOpts. Thereby, the
+// container create request within the pod will be routed directly to the
+// shim without calling the start command again.
+//
+// NOTE: This shim will not support routing the create request to an existing
+// shim based on annotations like `io.kubernetes.cri.sandbox-id`.
+func (m *shimManager) Start(ctx context.Context, id string, opts shim.StartOpts) (_ shim.BootstrapParams, retErr error) {
+ // We cant write anything to stdout/stderr for this cmd.
+ logrus.SetOutput(io.Discard)
+
+ var params shim.BootstrapParams
+ params.Version = 3
+ params.Protocol = "ttrpc"
+
+ cwd, err := os.Getwd()
+ if err != nil {
+ return params, fmt.Errorf("failed to get current working directory: %w", err)
+ }
+
+ f, err := os.Create(filepath.Join(cwd, "panic.log"))
+ if err != nil {
+ return params, fmt.Errorf("failed to create panic log file: %w", err)
+ }
+ defer f.Close()
+
+ ns, err := namespaces.NamespaceRequired(ctx)
+ if err != nil {
+ return params, fmt.Errorf("failed to get namespace from context: %w", err)
+ }
+
+ // Create an event on which we will listen to know when the shim is ready to accept connections.
+ // The child serve process signals this event once its TTRPC server is fully initialized.
+ eventName, _ := windows.UTF16PtrFromString(fmt.Sprintf(serveReadyEventNameFormat, ns, id))
+
+ // Create the named event
+ handle, err := windows.CreateEvent(nil, 0, 0, eventName)
+ if err != nil {
+ return params, fmt.Errorf("failed to create event: %w", err)
+ }
+ defer func() {
+ _ = windows.CloseHandle(handle)
+ }()
+
+ // address is the named pipe address that the shim will use to serve the ttrpc service.
+ address := fmt.Sprintf(addrFmt, ns, id)
+
+ // Create the serve command.
+ cmd, err := newCommand(ctx, id, opts.Address, address, f)
+ if err != nil {
+ return params, err
+ }
+
+ if err = cmd.Start(); err != nil {
+ return params, err
+ }
+
+ defer func() {
+ if retErr != nil {
+ _ = cmd.Process.Kill()
+ }
+ }()
+
+ // Block until the child signals the event.
+ _, _ = windows.WaitForSingleObject(handle, windows.INFINITE)
+
+ params.Address = address
+ return params, nil
+}
+
+// Stop tears down a running shim instance identified by id.
+// It reads and logs any panic messages written to panic.log, then tries to
+// terminate the associated HCS compute system and waits up to 30 seconds for
+// it to exit.
+func (m *shimManager) Stop(_ context.Context, id string) (resp shim.StopStatus, err error) {
+ ctx, span := oc.StartSpan(context.Background(), "delete")
+ defer span.End()
+ defer func() { oc.SetSpanStatus(span, err) }()
+
+ var bundlePath string
+ if opts, ok := ctx.Value(shim.OptsKey{}).(shim.Opts); ok {
+ bundlePath = opts.BundlePath
+ }
+
+ if bundlePath == "" {
+ return resp, fmt.Errorf("bundle path not found in context")
+ }
+
+ // hcsshim shim writes panic logs in the bundle directory in a file named "panic.log"
+ // log those messages (if any) on stderr so that it shows up in containerd's log.
+ // This should be done as the first thing so that we don't miss any panic logs even if
+ // something goes wrong during delete op.
+ // The file can be very large so read only first 1MB of data.
+ readLimit := int64(memory.MiB) // 1MB
+ logBytes, err := limitedRead(filepath.Join(bundlePath, "panic.log"), readLimit)
+ if err == nil && len(logBytes) > 0 {
+ if int64(len(logBytes)) == readLimit {
+ logrus.Warnf("shim panic log file %s is larger than 1MB, logging only first 1MB", filepath.Join(bundlePath, "panic.log"))
+ }
+ logrus.WithField("log", string(logBytes)).Warn("found shim panic logs during delete")
+ } else if err != nil && !errors.Is(err, os.ErrNotExist) {
+ logrus.WithError(err).Warn("failed to open shim panic log")
+ }
+
+ // Attempt to find the hcssystem for this bundle and terminate it.
+ if sys, _ := hcs.OpenComputeSystem(ctx, id); sys != nil {
+ defer sys.Close()
+ if err := sys.Terminate(ctx); err != nil {
+ fmt.Fprintf(os.Stderr, "failed to terminate '%s': %v", id, err)
+ } else {
+ ch := make(chan error, 1)
+ go func() { ch <- sys.Wait() }()
+ t := time.NewTimer(time.Second * 30)
+ select {
+ case <-t.C:
+ sys.Close()
+ return resp, fmt.Errorf("timed out waiting for '%s' to terminate", id)
+ case err := <-ch:
+ t.Stop()
+ if err != nil {
+ fmt.Fprintf(os.Stderr, "failed to wait for '%s' to terminate: %v", id, err)
+ }
+ }
+ }
+ }
+
+ resp = shim.StopStatus{
+ ExitedAt: time.Now(),
+ // 255 exit code is used by convention to indicate unknown exit reason.
+ ExitStatus: 255,
+ }
+ return resp, nil
+}
+
+// limitedRead reads at max `readLimitBytes` bytes from the file at path `filePath`. If the file has
+// more than `readLimitBytes` bytes of data then first `readLimitBytes` will be returned.
+// Read at most readLimitBytes so delete does not flood logs.
+func limitedRead(filePath string, readLimitBytes int64) ([]byte, error) {
+ f, err := os.Open(filePath)
+ if err != nil {
+ return nil, fmt.Errorf("limited read failed to open file: %s: %w", filePath, err)
+ }
+ defer f.Close()
+ fi, err := f.Stat()
+ if err != nil {
+ return []byte{}, fmt.Errorf("limited read failed during file stat: %s: %w", filePath, err)
+ }
+ if fi.Size() < readLimitBytes {
+ readLimitBytes = fi.Size()
+ }
+ buf := make([]byte, readLimitBytes)
+ _, err = f.Read(buf)
+ if err != nil {
+ return []byte{}, fmt.Errorf("limited read failed during file read: %s: %w", filePath, err)
+ }
+ return buf, nil
+}
+
+// Info returns runtime information about this shim including its name, version,
+// git commit, OCI spec version, and any runtime options decoded from optionsR.
+func (m *shimManager) Info(_ context.Context, optionsR io.Reader) (*types.RuntimeInfo, error) {
+ info := &types.RuntimeInfo{
+ Name: m.name,
+ Version: &types.RuntimeVersion{
+ Version: fmt.Sprintf("%s\ncommit: %s\nspec: %s", hcsversion.Version, hcsversion.Commit, specs.Version),
+ },
+ Annotations: nil,
+ }
+
+ opts, err := shim.ReadRuntimeOptions[*runhcsopts.Options](optionsR)
+ if err != nil {
+ if !errors.Is(err, errdefs.ErrNotFound) {
+ return nil, fmt.Errorf("failed to read runtime options (*options.Options): %w", err)
+ }
+ }
+ if opts != nil {
+ info.Options, err = typeurl.MarshalAnyToProto(opts)
+ if err != nil {
+ return nil, fmt.Errorf("failed to marshal %T: %w", opts, err)
+ }
+ }
+
+ return info, nil
+}
diff --git a/cmd/containerd-shim-lcow-v2/manager_test.go b/cmd/containerd-shim-lcow-v2/manager_test.go
new file mode 100644
index 0000000000..fd93692299
--- /dev/null
+++ b/cmd/containerd-shim-lcow-v2/manager_test.go
@@ -0,0 +1,46 @@
+//go:build windows
+
+package main
+
+import (
+ "os"
+ "path/filepath"
+ "testing"
+)
+
+// TestLimitedRead verifies that limitedRead correctly enforces the byte limit
+// when the file is larger than the limit, and reads the full content when the
+// file is smaller than the limit.
+func TestLimitedRead(t *testing.T) {
+ dir := t.TempDir()
+ filePath := filepath.Join(dir, "panic.log")
+ content := []byte("hello")
+ if err := os.WriteFile(filePath, content, 0o644); err != nil {
+ t.Fatalf("WriteFile: %v", err)
+ }
+
+ buf, err := limitedRead(filePath, 2)
+ if err != nil {
+ t.Fatalf("limitedRead: %v", err)
+ }
+ if string(buf) != "he" {
+ t.Fatalf("expected 'he', got %q", string(buf))
+ }
+
+ buf, err = limitedRead(filePath, 10)
+ if err != nil {
+ t.Fatalf("limitedRead: %v", err)
+ }
+ if string(buf) != "hello" {
+ t.Fatalf("expected 'hello', got %q", string(buf))
+ }
+}
+
+// TestLimitedReadMissingFile verifies that limitedRead returns an error when
+// the target file does not exist.
+func TestLimitedReadMissingFile(t *testing.T) {
+ _, err := limitedRead(filepath.Join(t.TempDir(), "missing.log"), 10)
+ if err == nil {
+ t.Fatalf("expected error for missing file")
+ }
+}
diff --git a/cmd/containerd-shim-lcow-v2/resource_windows_386.syso b/cmd/containerd-shim-lcow-v2/resource_windows_386.syso
new file mode 100644
index 0000000000..5510dc97e2
Binary files /dev/null and b/cmd/containerd-shim-lcow-v2/resource_windows_386.syso differ
diff --git a/cmd/containerd-shim-lcow-v2/resource_windows_amd64.syso b/cmd/containerd-shim-lcow-v2/resource_windows_amd64.syso
new file mode 100644
index 0000000000..2c00dedb25
Binary files /dev/null and b/cmd/containerd-shim-lcow-v2/resource_windows_amd64.syso differ
diff --git a/cmd/containerd-shim-lcow-v2/resource_windows_arm.syso b/cmd/containerd-shim-lcow-v2/resource_windows_arm.syso
new file mode 100644
index 0000000000..2706f485e1
Binary files /dev/null and b/cmd/containerd-shim-lcow-v2/resource_windows_arm.syso differ
diff --git a/cmd/containerd-shim-lcow-v2/resource_windows_arm64.syso b/cmd/containerd-shim-lcow-v2/resource_windows_arm64.syso
new file mode 100644
index 0000000000..718ad2bfb8
Binary files /dev/null and b/cmd/containerd-shim-lcow-v2/resource_windows_arm64.syso differ
diff --git a/cmd/containerd-shim-lcow-v2/service/plugin/plugin.go b/cmd/containerd-shim-lcow-v2/service/plugin/plugin.go
new file mode 100644
index 0000000000..dd7c1124ae
--- /dev/null
+++ b/cmd/containerd-shim-lcow-v2/service/plugin/plugin.go
@@ -0,0 +1,115 @@
+//go:build windows
+
+package plugin
+
+import (
+ "context"
+ "os"
+
+ "github.com/Microsoft/hcsshim/cmd/containerd-shim-lcow-v2/service"
+ "github.com/Microsoft/hcsshim/internal/shim"
+ "github.com/Microsoft/hcsshim/internal/shimdiag"
+ hcsversion "github.com/Microsoft/hcsshim/internal/version"
+
+ "github.com/Microsoft/go-winio/pkg/etw"
+ "github.com/Microsoft/go-winio/pkg/etwlogrus"
+ "github.com/Microsoft/go-winio/pkg/guid"
+ "github.com/containerd/containerd/v2/pkg/shutdown"
+ "github.com/containerd/containerd/v2/plugins"
+ "github.com/containerd/plugin"
+ "github.com/containerd/plugin/registry"
+ "github.com/sirupsen/logrus"
+)
+
const (
	// etwProviderName is the ETW provider name for lcow shim.
	etwProviderName = "Microsoft.Virtualization.containerd-shim-lcow-v2"
)

// svc holds the single Service instance created during plugin initialization.
// It is read by etwCallback to dump stacks on ETW capture-state requests.
var svc *service.Service
+
+func init() {
+ // Provider ID: 64F6FC7F-8326-5EE8-B890-3734AE584136
+ // Provider and hook aren't closed explicitly, as they will exist until process exit.
+ provider, err := etw.NewProvider(etwProviderName, etwCallback)
+ if err != nil {
+ logrus.Error(err)
+ } else {
+ if hook, err := etwlogrus.NewHookFromProvider(provider); err == nil {
+ logrus.AddHook(hook)
+ } else {
+ logrus.Error(err)
+ }
+ }
+
+ // Write the "ShimLaunched" event with the shim's command-line arguments.
+ _ = provider.WriteEvent(
+ "ShimLaunched",
+ nil,
+ etw.WithFields(
+ etw.StringArray("Args", os.Args),
+ etw.StringField("Version", hcsversion.Version),
+ etw.StringField("GitCommit", hcsversion.Commit),
+ ),
+ )
+
+ // Register the shim's TTRPC plugin with the containerd plugin registry.
+ // The plugin depends on the event publisher (for publishing task/sandbox
+ // events to containerd) and the internal shutdown service (for co-ordinated
+ // graceful teardown).
+ registry.Register(&plugin.Registration{
+ Type: plugins.TTRPCPlugin,
+ ID: "shim-services",
+ Requires: []plugin.Type{
+ plugins.EventPlugin,
+ plugins.InternalPlugin,
+ },
+ InitFn: func(ic *plugin.InitContext) (interface{}, error) {
+ pp, err := ic.GetByID(plugins.EventPlugin, "publisher")
+ if err != nil {
+ return nil, err
+ }
+ ss, err := ic.GetByID(plugins.InternalPlugin, "shutdown")
+ if err != nil {
+ return nil, err
+ }
+ // We will register all the services namely-
+ // 1. Sandbox service
+ // 2. Task service
+ // 3. Shimdiag service
+ svc = service.NewService(
+ ic.Context,
+ pp.(shim.Publisher),
+ ss.(shutdown.Service),
+ )
+
+ return svc, nil
+ },
+ })
+}
+
+// etwCallback is the ETW callback method for this shim.
+//
+// On a CaptureState notification (triggered by tools such as wpr or xperf) it
+// dumps all goroutine stacks – both host-side Go stacks and, when available,
+// the guest Linux stacks – to the logrus logger tagged with the sandbox ID.
+// This provides an out-of-band diagnostic snapshot without requiring the shim
+// to be paused or restarted.
+func etwCallback(sourceID guid.GUID, state etw.ProviderState, level etw.Level, matchAnyKeyword uint64, matchAllKeyword uint64, filterData uintptr) {
+ if state == etw.ProviderStateCaptureState {
+ if svc == nil {
+ logrus.Warn("service not initialized")
+ return
+ }
+ resp, err := svc.DiagStacks(context.Background(), &shimdiag.StacksRequest{})
+ if err != nil {
+ return
+ }
+ log := logrus.WithField("sandboxID", svc.SandboxID())
+ log.WithField("stack", resp.Stacks).Info("goroutine stack dump")
+ if resp.GuestStacks != "" {
+ log.WithField("stack", resp.GuestStacks).Info("guest stack dump")
+ }
+ }
+}
diff --git a/cmd/containerd-shim-lcow-v2/service/service.go b/cmd/containerd-shim-lcow-v2/service/service.go
new file mode 100644
index 0000000000..3c2968c53f
--- /dev/null
+++ b/cmd/containerd-shim-lcow-v2/service/service.go
@@ -0,0 +1,120 @@
+//go:build windows
+
+package service
+
+import (
+ "context"
+ "sync"
+
+ "github.com/Microsoft/hcsshim/internal/builder/vm/lcow"
+ "github.com/Microsoft/hcsshim/internal/controller/vm"
+ "github.com/Microsoft/hcsshim/internal/log"
+ "github.com/Microsoft/hcsshim/internal/shim"
+ "github.com/Microsoft/hcsshim/internal/shimdiag"
+
+ sandboxsvc "github.com/containerd/containerd/api/runtime/sandbox/v1"
+ tasksvc "github.com/containerd/containerd/api/runtime/task/v3"
+ "github.com/containerd/containerd/v2/core/runtime"
+ "github.com/containerd/containerd/v2/pkg/namespaces"
+ "github.com/containerd/containerd/v2/pkg/shutdown"
+ "github.com/containerd/ttrpc"
+)
+
const (
	// ShimName is the name of the LCOW shim implementation (and the shim binary).
	ShimName = "containerd-shim-lcow-v2"
)

// Service is the shared Service struct that implements all TTRPC Service interfaces.
// All Service methods (sandbox, task, and shimdiag) operate on this shared struct.
type Service struct {
	// mu is used to synchronize access to shared state within the Service.
	mu sync.Mutex

	// publisher is used to publish events from the shim to containerd.
	publisher shim.Publisher
	// events is a buffered channel used to queue events before they are published
	// to containerd by the forward goroutine. It is closed during shutdown.
	events chan interface{}

	// sandboxID is the unique identifier for the sandbox managed by this Service instance.
	// For LCOW shim, sandboxID corresponds 1-1 with the UtilityVM managed by the shim.
	sandboxID string

	// sandboxOptions contains parsed, shim-level configuration for the sandbox
	// such as architecture and confidential-compute settings.
	sandboxOptions *lcow.SandboxOptions

	// vmController is responsible for managing the lifecycle of the underlying utility VM and its associated resources.
	vmController vm.Controller

	// shutdown manages graceful shutdown operations and allows registration of cleanup callbacks.
	shutdown shutdown.Service
}

// Compile-time assertion that Service implements shim.TTRPCService.
var _ shim.TTRPCService = (*Service)(nil)
+
+// NewService creates a new instance of the Service with the shared state.
+func NewService(ctx context.Context, eventsPublisher shim.Publisher, sd shutdown.Service) *Service {
+ svc := &Service{
+ publisher: eventsPublisher,
+ events: make(chan interface{}, 128), // Buffered channel for events
+ vmController: vm.NewController(),
+ shutdown: sd,
+ }
+
+ go svc.forward(ctx, eventsPublisher)
+
+ // Register a shutdown callback to close the events channel,
+ // which signals the forward goroutine to exit.
+ sd.RegisterCallback(func(context.Context) error {
+ close(svc.events)
+ return nil
+ })
+
+ // Perform best-effort VM cleanup on shutdown.
+ sd.RegisterCallback(func(ctx context.Context) error {
+ _ = svc.vmController.TerminateVM(ctx)
+ return nil
+ })
+
+ return svc
+}
+
// RegisterTTRPC registers the Task, Sandbox, and ShimDiag TTRPC services on
// the provided server so that containerd can call into the shim over TTRPC.
// The single Service value backs all three registrations.
func (s *Service) RegisterTTRPC(server *ttrpc.Server) error {
	tasksvc.RegisterTTRPCTaskService(server, s)
	sandboxsvc.RegisterTTRPCSandboxService(server, s)
	shimdiag.RegisterShimDiagService(server, s)
	return nil
}
+
// SandboxID returns the unique identifier for the sandbox managed by this Service.
// NOTE(review): sandboxID is read here without holding mu; presumably it is
// written once during sandbox creation — confirm before relying on this from
// concurrent callers.
func (s *Service) SandboxID() string {
	return s.sandboxID
}

// send enqueues an event onto the internal events channel so that it can be
// forwarded to containerd asynchronously by the forward goroutine.
// NOTE(review): sending after shutdown has closed the events channel would
// panic — callers must not enqueue once shutdown has begun.
//
// TODO: wire up send() for task events once task lifecycle methods are implemented.
//
//nolint:unused
func (s *Service) send(evt interface{}) {
	s.events <- evt
}
+
+// forward runs in a dedicated goroutine and publishes events from the internal
+// events channel to containerd using the provided Publisher. It exits when the
+// events channel is closed (which happens during graceful shutdown).
+func (s *Service) forward(ctx context.Context, publisher shim.Publisher) {
+ ns, _ := namespaces.Namespace(ctx)
+ ctx = namespaces.WithNamespace(context.Background(), ns)
+ for e := range s.events {
+ err := publisher.Publish(ctx, runtime.GetTopic(e), e)
+ if err != nil {
+ log.G(ctx).WithError(err).Error("post event")
+ }
+ }
+ _ = publisher.Close()
+}
diff --git a/cmd/containerd-shim-lcow-v2/service/service_sandbox.go b/cmd/containerd-shim-lcow-v2/service/service_sandbox.go
new file mode 100644
index 0000000000..3e2a3ea437
--- /dev/null
+++ b/cmd/containerd-shim-lcow-v2/service/service_sandbox.go
@@ -0,0 +1,170 @@
+//go:build windows
+
+package service
+
+import (
+ "context"
+
+ "github.com/Microsoft/hcsshim/internal/log"
+ "github.com/Microsoft/hcsshim/internal/logfields"
+ "github.com/Microsoft/hcsshim/internal/oc"
+
+ "github.com/containerd/containerd/api/runtime/sandbox/v1"
+ "github.com/containerd/errdefs/pkg/errgrpc"
+ "github.com/sirupsen/logrus"
+ "go.opencensus.io/trace"
+)
+
// Ensure Service implements the TTRPCSandboxService interface at compile time.
var _ sandbox.TTRPCSandboxService = &Service{}
+
+// CreateSandbox creates (or prepares) a new sandbox for the given SandboxID.
+// This method is part of the instrumentation layer and business logic is included in createSandboxInternal.
+func (s *Service) CreateSandbox(ctx context.Context, request *sandbox.CreateSandboxRequest) (resp *sandbox.CreateSandboxResponse, err error) {
+ ctx, span := oc.StartSpan(ctx, "CreateSandbox")
+ defer span.End()
+ defer func() {
+ oc.SetSpanStatus(span, err)
+ }()
+
+ span.AddAttributes(
+ trace.StringAttribute(logfields.SandboxID, request.SandboxID),
+ trace.StringAttribute(logfields.Bundle, request.BundlePath),
+ trace.StringAttribute(logfields.NetNsPath, request.NetnsPath),
+ )
+
+ // Set the sandbox ID in the logger context for all subsequent logs in this request.
+ ctx, _ = log.WithContext(ctx, logrus.WithField(logfields.SandboxID, request.SandboxID))
+
+ r, e := s.createSandboxInternal(ctx, request)
+ return r, errgrpc.ToGRPC(e)
+}
+
+// StartSandbox transitions a previously created sandbox to the "running" state.
+// This method is part of the instrumentation layer and business logic is included in startSandboxInternal.
+func (s *Service) StartSandbox(ctx context.Context, request *sandbox.StartSandboxRequest) (resp *sandbox.StartSandboxResponse, err error) {
+ ctx, span := oc.StartSpan(ctx, "StartSandbox")
+ defer span.End()
+ defer func() { oc.SetSpanStatus(span, err) }()
+
+ span.AddAttributes(trace.StringAttribute(logfields.SandboxID, request.SandboxID))
+
+ // Set the sandbox ID in the logger context for all subsequent logs in this request.
+ ctx, _ = log.WithContext(ctx, logrus.WithField(logfields.SandboxID, request.SandboxID))
+
+ r, e := s.startSandboxInternal(ctx, request)
+ return r, errgrpc.ToGRPC(e)
+}
+
+// Platform returns the platform details for the sandbox ("windows/amd64" or "linux/amd64").
+// This method is part of the instrumentation layer and business logic is included in platformInternal.
+func (s *Service) Platform(ctx context.Context, request *sandbox.PlatformRequest) (resp *sandbox.PlatformResponse, err error) {
+ ctx, span := oc.StartSpan(ctx, "Platform")
+ defer span.End()
+ defer func() { oc.SetSpanStatus(span, err) }()
+
+ span.AddAttributes(trace.StringAttribute(logfields.SandboxID, request.SandboxID))
+
+ r, e := s.platformInternal(ctx, request)
+ return r, errgrpc.ToGRPC(e)
+}
+
+// StopSandbox attempts a graceful stop of the sandbox within the specified timeout.
+// This method is part of the instrumentation layer and business logic is included in stopSandboxInternal.
+func (s *Service) StopSandbox(ctx context.Context, request *sandbox.StopSandboxRequest) (resp *sandbox.StopSandboxResponse, err error) {
+ ctx, span := oc.StartSpan(ctx, "StopSandbox")
+ defer span.End()
+ defer func() { oc.SetSpanStatus(span, err) }()
+
+ span.AddAttributes(trace.StringAttribute(logfields.SandboxID, request.SandboxID))
+ span.AddAttributes(trace.Int64Attribute(logfields.Timeout, int64(request.TimeoutSecs)))
+
+ // Set the sandbox ID in the logger context for all subsequent logs in this request.
+ ctx, _ = log.WithContext(ctx, logrus.WithField(logfields.SandboxID, request.SandboxID))
+
+ r, e := s.stopSandboxInternal(ctx, request)
+ return r, errgrpc.ToGRPC(e)
+}
+
+// WaitSandbox blocks until the sandbox reaches a terminal state (stopped/errored) and returns the outcome.
+// This method is part of the instrumentation layer and business logic is included in waitSandboxInternal.
+func (s *Service) WaitSandbox(ctx context.Context, request *sandbox.WaitSandboxRequest) (resp *sandbox.WaitSandboxResponse, err error) {
+ ctx, span := oc.StartSpan(ctx, "WaitSandbox")
+ defer span.End()
+ defer func() { oc.SetSpanStatus(span, err) }()
+
+ span.AddAttributes(trace.StringAttribute(logfields.SandboxID, request.SandboxID))
+
+ // Set the sandbox ID in the logger context for all subsequent logs in this request.
+ ctx, _ = log.WithContext(ctx, logrus.WithField(logfields.SandboxID, request.SandboxID))
+
+ r, e := s.waitSandboxInternal(ctx, request)
+ return r, errgrpc.ToGRPC(e)
+}
+
+// SandboxStatus returns current status for the sandbox, optionally verbose.
+// This method is part of the instrumentation layer and business logic is included in sandboxStatusInternal.
+func (s *Service) SandboxStatus(ctx context.Context, request *sandbox.SandboxStatusRequest) (resp *sandbox.SandboxStatusResponse, err error) {
+ ctx, span := oc.StartSpan(ctx, "SandboxStatus")
+ defer span.End()
+ defer func() { oc.SetSpanStatus(span, err) }()
+
+ span.AddAttributes(trace.StringAttribute(logfields.SandboxID, request.SandboxID))
+ span.AddAttributes(trace.BoolAttribute(logfields.Verbose, request.Verbose))
+
+ // Set the sandbox ID in the logger context for all subsequent logs in this request.
+ ctx, _ = log.WithContext(ctx, logrus.WithField(logfields.SandboxID, request.SandboxID))
+
+ r, e := s.sandboxStatusInternal(ctx, request)
+ return r, errgrpc.ToGRPC(e)
+}
+
+// PingSandbox performs a minimal liveness check on the sandbox and returns quickly.
+// This method is part of the instrumentation layer and business logic is included in pingSandboxInternal.
+func (s *Service) PingSandbox(ctx context.Context, request *sandbox.PingRequest) (resp *sandbox.PingResponse, err error) {
+ ctx, span := oc.StartSpan(ctx, "PingSandbox")
+ defer span.End()
+ defer func() { oc.SetSpanStatus(span, err) }()
+
+ span.AddAttributes(trace.StringAttribute(logfields.SandboxID, request.SandboxID))
+
+ // Set the sandbox ID in the logger context for all subsequent logs in this request.
+ ctx, _ = log.WithContext(ctx, logrus.WithField(logfields.SandboxID, request.SandboxID))
+
+ r, e := s.pingSandboxInternal(ctx, request)
+ return r, errgrpc.ToGRPC(e)
+}
+
+// ShutdownSandbox requests a full shim + sandbox shutdown (stronger than StopSandbox),
+// typically used by the higher-level controller to tear down resources and exit the shim.
+// This method is part of the instrumentation layer and business logic is included in shutdownSandboxInternal.
+func (s *Service) ShutdownSandbox(ctx context.Context, request *sandbox.ShutdownSandboxRequest) (resp *sandbox.ShutdownSandboxResponse, err error) {
+ ctx, span := oc.StartSpan(ctx, "ShutdownSandbox")
+ defer span.End()
+ defer func() { oc.SetSpanStatus(span, err) }()
+
+ span.AddAttributes(trace.StringAttribute(logfields.SandboxID, request.SandboxID))
+
+ // Set the sandbox ID in the logger context for all subsequent logs in this request.
+ ctx, _ = log.WithContext(ctx, logrus.WithField(logfields.SandboxID, request.SandboxID))
+
+ r, e := s.shutdownSandboxInternal(ctx, request)
+ return r, errgrpc.ToGRPC(e)
+}
+
+// SandboxMetrics returns runtime metrics for the sandbox (e.g., CPU/memory/IO),
+// suitable for monitoring and autoscaling decisions.
+// This method is part of the instrumentation layer and business logic is included in sandboxMetricsInternal.
+func (s *Service) SandboxMetrics(ctx context.Context, request *sandbox.SandboxMetricsRequest) (resp *sandbox.SandboxMetricsResponse, err error) {
+ ctx, span := oc.StartSpan(ctx, "SandboxMetrics")
+ defer span.End()
+ defer func() { oc.SetSpanStatus(span, err) }()
+
+ span.AddAttributes(trace.StringAttribute(logfields.SandboxID, request.SandboxID))
+
+ // Set the sandbox ID in the logger context for all subsequent logs in this request.
+ ctx, _ = log.WithContext(ctx, logrus.WithField(logfields.SandboxID, request.SandboxID))
+
+ r, e := s.sandboxMetricsInternal(ctx, request)
+ return r, errgrpc.ToGRPC(e)
+}
diff --git a/cmd/containerd-shim-lcow-v2/service/service_sandbox_internal.go b/cmd/containerd-shim-lcow-v2/service/service_sandbox_internal.go
new file mode 100644
index 0000000000..f8cd3dfd97
--- /dev/null
+++ b/cmd/containerd-shim-lcow-v2/service/service_sandbox_internal.go
@@ -0,0 +1,321 @@
+//go:build windows
+
+package service
+
+import (
+ "context"
+ "encoding/json"
+ "fmt"
+ "os"
+ "path/filepath"
+ "time"
+
+ "github.com/Microsoft/hcsshim/internal/builder/vm/lcow"
+ "github.com/Microsoft/hcsshim/internal/controller/vm"
+ "github.com/Microsoft/hcsshim/internal/gcs/prot"
+ "github.com/Microsoft/hcsshim/internal/log"
+ "github.com/Microsoft/hcsshim/internal/protocol/guestresource"
+ "github.com/Microsoft/hcsshim/internal/vm/vmutils"
+ vmsandbox "github.com/Microsoft/hcsshim/sandbox-spec/vm/v2"
+
+ "github.com/Microsoft/go-winio"
+ "github.com/containerd/containerd/api/runtime/sandbox/v1"
+ "github.com/containerd/containerd/api/types"
+ "github.com/containerd/errdefs"
+ "github.com/containerd/typeurl/v2"
+ "golang.org/x/sys/windows"
+ "google.golang.org/protobuf/types/known/timestamppb"
+)
+
const (
	// linuxPlatform refers to the Linux guest OS platform.
	linuxPlatform = "linux"

	// SandboxStateReady indicates the sandbox is ready.
	// NOTE(review): these strings presumably mirror the CRI sandbox state
	// values expected by containerd — confirm against the sandbox API consumer.
	SandboxStateReady = "SANDBOX_READY"
	// SandboxStateNotReady indicates the sandbox is not ready.
	SandboxStateNotReady = "SANDBOX_NOTREADY"
)
+
+// createSandboxInternal is the implementation for CreateSandbox.
+//
+// It enforces that only one sandbox can exist per shim instance (this shim
+// follows a one-sandbox-per-shim model). It builds the HCS compute-system
+// document from the sandbox spec and delegates VM creation to vmController.
+func (s *Service) createSandboxInternal(ctx context.Context, request *sandbox.CreateSandboxRequest) (*sandbox.CreateSandboxResponse, error) {
+ // Decode the Sandbox spec passed along from CRI.
+ var sandboxSpec vmsandbox.Spec
+ f, err := os.Open(filepath.Join(request.BundlePath, "config.json"))
+ if err != nil {
+ return nil, err
+ }
+ if err := json.NewDecoder(f).Decode(&sandboxSpec); err != nil {
+ _ = f.Close()
+ return nil, err
+ }
+ _ = f.Close()
+
+ // Decode the runtime options.
+ shimOpts, err := vmutils.UnmarshalRuntimeOptions(ctx, request.Options)
+ if err != nil {
+ return nil, err
+ }
+
+ // We take a lock at this point so that if there are multiple parallel calls to CreateSandbox,
+ // only one will succeed in creating the sandbox. The successful caller will set the sandboxID,
+ // which will cause the other call(s) to fail with an error indicating that a sandbox already exists.
+ s.mu.Lock()
+ defer s.mu.Unlock()
+
+ if s.sandboxID != "" {
+ return nil, fmt.Errorf("sandbox already exists with ID %s", s.sandboxID)
+ }
+
+ hcsDocument, sandboxOptions, err := lcow.BuildSandboxConfig(ctx, ShimName, request.BundlePath, shimOpts, &sandboxSpec)
+ if err != nil {
+ return nil, fmt.Errorf("failed to parse sandbox spec: %w", err)
+ }
+
+ s.sandboxOptions = sandboxOptions
+
+ err = s.vmController.CreateVM(ctx, &vm.CreateOptions{
+ ID: fmt.Sprintf("%s@vm", request.SandboxID),
+ HCSDocument: hcsDocument,
+ })
+ if err != nil {
+ return nil, fmt.Errorf("failed to create VM: %w", err)
+ }
+
+ // By setting the sandboxID here, we ensure that any parallel calls for CreateSandbox
+ // will fail with an error.
+ // Also, setting it here acts as a synchronization point - we know that if sandboxID is set,
+ // then the VM has been created successfully and sandboxOptions has been populated.
+ s.sandboxID = request.SandboxID
+
+ return &sandbox.CreateSandboxResponse{}, nil
+}
+
// startSandboxInternal is the implementation for StartSandbox.
//
// It instructs the vmController to start the VM. If the
// sandbox was created with confidential settings, confidential options are
// applied to the VM after starting.
func (s *Service) startSandboxInternal(ctx context.Context, request *sandbox.StartSandboxRequest) (*sandbox.StartSandboxResponse, error) {
	if s.sandboxID != request.SandboxID {
		return nil, fmt.Errorf("sandbox ID mismatch, expected %s, got %s", s.sandboxID, request.SandboxID)
	}

	// If we successfully got past the above check, it means the sandbox was created and
	// the sandboxOptions should be populated (see createSandboxInternal, which sets
	// sandboxID only after sandboxOptions is assigned).
	var confidentialOpts *guestresource.ConfidentialOptions
	if s.sandboxOptions != nil && s.sandboxOptions.ConfidentialConfig != nil {
		// Load and encode the UVM reference info needed for attestation of the
		// confidential guest.
		uvmReferenceInfoEncoded, err := vmutils.ParseUVMReferenceInfo(
			ctx,
			vmutils.DefaultLCOWOSBootFilesPath(),
			s.sandboxOptions.ConfidentialConfig.UvmReferenceInfoFile,
		)
		if err != nil {
			return nil, fmt.Errorf("failed to parse UVM reference info: %w", err)
		}
		confidentialOpts = &guestresource.ConfidentialOptions{
			EnforcerType:          s.sandboxOptions.ConfidentialConfig.SecurityPolicyEnforcer,
			EncodedSecurityPolicy: s.sandboxOptions.ConfidentialConfig.SecurityPolicy,
			EncodedUVMReference:   uvmReferenceInfoEncoded,
		}
	}

	// The VM controller ensures that only one Start call goes through.
	err := s.vmController.StartVM(ctx, &vm.StartOptions{
		GCSServiceID:        winio.VsockServiceID(prot.LinuxGcsVsockPort),
		ConfidentialOptions: confidentialOpts,
	})
	if err != nil {
		return nil, fmt.Errorf("failed to start VM: %w", err)
	}

	return &sandbox.StartSandboxResponse{
		CreatedAt: timestamppb.New(s.vmController.StartTime()),
	}, nil
}
+
+// platformInternal is the implementation for Platform.
+//
+// It returns the guest OS and CPU architecture for the sandbox.
+// An error is returned if the sandbox is not currently in the created state.
+func (s *Service) platformInternal(_ context.Context, request *sandbox.PlatformRequest) (*sandbox.PlatformResponse, error) {
+ if s.sandboxID != request.SandboxID {
+ return nil, fmt.Errorf("sandbox ID mismatch, expected %s, got %s", s.sandboxID, request.SandboxID)
+ }
+
+ if s.vmController.State() == vm.StateNotCreated {
+ return nil, fmt.Errorf("sandbox has not been created (state: %s)", s.vmController.State())
+ }
+
+ return &sandbox.PlatformResponse{
+ Platform: &types.Platform{
+ OS: linuxPlatform,
+ Architecture: s.sandboxOptions.Architecture,
+ },
+ }, nil
+}
+
+// stopSandboxInternal is the implementation for StopSandbox.
+//
+// It terminates the VM and performs any cleanup, if needed.
+func (s *Service) stopSandboxInternal(ctx context.Context, request *sandbox.StopSandboxRequest) (*sandbox.StopSandboxResponse, error) {
+ if s.sandboxID != request.SandboxID {
+ return nil, fmt.Errorf("sandbox ID mismatch, expected %s, got %s", s.sandboxID, request.SandboxID)
+ }
+
+ err := s.vmController.TerminateVM(ctx)
+ if err != nil {
+ return nil, fmt.Errorf("failed to terminate VM: %w", err)
+ }
+
+ return &sandbox.StopSandboxResponse{}, nil
+}
+
+// waitSandboxInternal is the implementation for WaitSandbox.
+//
+// It blocks until the underlying VM has been terminated, then maps the exit status
+// to a sandbox exit code.
+func (s *Service) waitSandboxInternal(ctx context.Context, request *sandbox.WaitSandboxRequest) (*sandbox.WaitSandboxResponse, error) {
+ if s.sandboxID != request.SandboxID {
+ return nil, fmt.Errorf("sandbox ID mismatch, expected %s, got %s", s.sandboxID, request.SandboxID)
+ }
+
+ // Wait for the VM to be terminated, then return the exit code.
+ // This is a blocking call that will wait until the VM is stopped.
+ err := s.vmController.Wait(ctx)
+ if err != nil {
+ return nil, fmt.Errorf("failed to wait for VM: %w", err)
+ }
+
+ exitStatus, err := s.vmController.ExitStatus()
+ if err != nil {
+ return nil, fmt.Errorf("failed to get sandbox exit status: %w", err)
+ }
+
+ exitStatusCode := 0
+ // If there was an exit error, set a non-zero exit status.
+ if exitStatus.Err != nil {
+ exitStatusCode = int(windows.ERROR_INTERNAL_ERROR)
+ }
+
+ return &sandbox.WaitSandboxResponse{
+ ExitStatus: uint32(exitStatusCode),
+ ExitedAt: timestamppb.New(exitStatus.StoppedTime),
+ }, nil
+}
+
// sandboxStatusInternal is the implementation for SandboxStatus.
//
// It synthesizes a status response from the current vmController state:
// a sandbox is "ready" only while the VM is running; all other states map
// to not-ready. When verbose is true, the response may be extended with
// additional diagnostic information (not yet implemented).
func (s *Service) sandboxStatusInternal(_ context.Context, request *sandbox.SandboxStatusRequest) (*sandbox.SandboxStatusResponse, error) {
	if s.sandboxID != request.SandboxID {
		return nil, fmt.Errorf("sandbox ID mismatch, expected %s, got %s", s.sandboxID, request.SandboxID)
	}

	resp := &sandbox.SandboxStatusResponse{
		SandboxID: request.SandboxID,
	}

	switch vmState := s.vmController.State(); vmState {
	case vm.StateNotCreated, vm.StateCreated, vm.StateInvalid:
		// VM has not started yet or is in invalid state; return the default not-ready
		// response immediately (no timestamps are available in these states).
		resp.State = SandboxStateNotReady
		return resp, nil
	case vm.StateRunning:
		// VM is running, so we can report the created time and ready state.
		resp.State = SandboxStateReady
		resp.CreatedAt = timestamppb.New(s.vmController.StartTime())
	case vm.StateTerminated:
		// VM has stopped, so we can report the created time, exited time, and not-ready state.
		resp.State = SandboxStateNotReady
		resp.CreatedAt = timestamppb.New(s.vmController.StartTime())
		stoppedStatus, err := s.vmController.ExitStatus()
		if err != nil {
			return nil, fmt.Errorf("failed to get sandbox stopped status: %w", err)
		}
		resp.ExitedAt = timestamppb.New(stoppedStatus.StoppedTime)
	}

	if request.Verbose { //nolint:staticcheck
		// TODO: Add compat info and any other details.
	}

	return resp, nil
}
+
+// pingSandboxInternal is the implementation for PingSandbox.
+//
+// Ping is not yet implemented for this shim.
+func (s *Service) pingSandboxInternal(_ context.Context, _ *sandbox.PingRequest) (*sandbox.PingResponse, error) {
+ // This functionality is not yet applicable for this shim.
+ // Best scenario, we can return true if the VM is running.
+ return nil, errdefs.ErrNotImplemented
+}
+
+// shutdownSandboxInternal is used to trigger sandbox shutdown when the shim receives
+// a shutdown request from containerd.
+//
+// The sandbox must already be in the stopped state before shutdown is accepted.
+func (s *Service) shutdownSandboxInternal(ctx context.Context, request *sandbox.ShutdownSandboxRequest) (*sandbox.ShutdownSandboxResponse, error) {
+ if s.sandboxID != request.SandboxID {
+ return &sandbox.ShutdownSandboxResponse{}, fmt.Errorf("sandbox ID mismatch, expected %s, got %s", s.sandboxID, request.SandboxID)
+ }
+
+ // Ensure the VM is terminated. If the VM is already terminated,
+ // TerminateVM is a no-op, so this is safe to call regardless of the current VM state.
+ if state := s.vmController.State(); state != vm.StateTerminated {
+ err := s.vmController.TerminateVM(ctx)
+ if err != nil {
+ // Just log the error instead of returning it since this is a best effort cleanup.
+ log.G(ctx).WithError(err).Error("failed to terminate VM during shutdown")
+ }
+ }
+
+ // With gRPC/TTRPC, the transport later creates a child context for each incoming request,
+ // and cancels that context when the handler returns or the client-side connection is dropped.
+ // For the shutdown request, if we call shutdown.Shutdown() directly, the shim process exits
+ // prior to the response being sent back to containerd, which causes the shutdown call to fail.
+ // Therefore, use a goroutine to wait for the RPC context to be done after which
+ // we can safely call shutdown.Shutdown() without risking an early process exit.
+ go func() {
+ <-ctx.Done()
+ time.Sleep(20 * time.Millisecond) // tiny cushion to avoid edge races
+
+ s.shutdown.Shutdown()
+ }()
+
+ return &sandbox.ShutdownSandboxResponse{}, nil
+}
+
+// sandboxMetricsInternal is the implementation for SandboxMetrics.
+//
+// It collects and returns runtime statistics from the vmController.
+func (s *Service) sandboxMetricsInternal(ctx context.Context, request *sandbox.SandboxMetricsRequest) (*sandbox.SandboxMetricsResponse, error) {
+ if s.sandboxID != request.SandboxID {
+ return &sandbox.SandboxMetricsResponse{}, fmt.Errorf("sandbox ID mismatch, expected %s, got %s", s.sandboxID, request.SandboxID)
+ }
+
+ stats, err := s.vmController.Stats(ctx)
+ if err != nil {
+ return &sandbox.SandboxMetricsResponse{}, fmt.Errorf("failed to get sandbox metrics: %w", err)
+ }
+
+ anyStat, err := typeurl.MarshalAny(stats)
+ if err != nil {
+ return &sandbox.SandboxMetricsResponse{}, fmt.Errorf("failed to marshal sandbox metrics: %w", err)
+ }
+
+ return &sandbox.SandboxMetricsResponse{
+ Metrics: &types.Metric{
+ Timestamp: timestamppb.Now(),
+ ID: request.SandboxID,
+ Data: typeurl.MarshalProto(anyStat),
+ },
+ }, nil
+}
diff --git a/cmd/containerd-shim-lcow-v2/service/service_shimdiag.go b/cmd/containerd-shim-lcow-v2/service/service_shimdiag.go
new file mode 100644
index 0000000000..503982d59e
--- /dev/null
+++ b/cmd/containerd-shim-lcow-v2/service/service_shimdiag.go
@@ -0,0 +1,97 @@
+//go:build windows
+
+package service
+
+import (
+ "context"
+ "os"
+ "strings"
+
+ "github.com/Microsoft/hcsshim/internal/logfields"
+ "github.com/Microsoft/hcsshim/internal/oc"
+ "github.com/Microsoft/hcsshim/internal/shimdiag"
+
+ "github.com/containerd/errdefs/pkg/errgrpc"
+ "go.opencensus.io/trace"
+)
+
// Ensure Service implements the ShimDiagService interface at compile time.
var _ shimdiag.ShimDiagService = &Service{}
+
+// DiagExecInHost executes a process in the host namespace for diagnostic purposes.
+// This method is part of the instrumentation layer and business logic is included in diagExecInHostInternal.
+func (s *Service) DiagExecInHost(ctx context.Context, request *shimdiag.ExecProcessRequest) (resp *shimdiag.ExecProcessResponse, err error) {
+ ctx, span := oc.StartSpan(ctx, "DiagExecInHost")
+ defer span.End()
+ defer func() { oc.SetSpanStatus(span, err) }()
+
+ span.AddAttributes(
+ trace.StringAttribute(logfields.SandboxID, s.sandboxID),
+ trace.StringAttribute(logfields.Args, strings.Join(request.Args, " ")),
+ trace.StringAttribute(logfields.Workdir, request.Workdir),
+ trace.BoolAttribute(logfields.Terminal, request.Terminal),
+ trace.StringAttribute(logfields.Stdin, request.Stdin),
+ trace.StringAttribute(logfields.Stdout, request.Stdout),
+ trace.StringAttribute(logfields.Stderr, request.Stderr))
+
+ r, e := s.diagExecInHostInternal(ctx, request)
+ return r, errgrpc.ToGRPC(e)
+}
+
+// DiagTasks returns information about all tasks in the shim.
+// This method is part of the instrumentation layer and business logic is included in diagTasksInternal.
+func (s *Service) DiagTasks(ctx context.Context, request *shimdiag.TasksRequest) (resp *shimdiag.TasksResponse, err error) {
+ ctx, span := oc.StartSpan(ctx, "DiagTasks")
+ defer span.End()
+ defer func() { oc.SetSpanStatus(span, err) }()
+
+ span.AddAttributes(
+ trace.StringAttribute(logfields.SandboxID, s.sandboxID),
+ trace.BoolAttribute(logfields.Execs, request.Execs))
+
+ r, e := s.diagTasksInternal(ctx, request)
+ return r, errgrpc.ToGRPC(e)
+}
+
+// DiagShare shares a directory from the host into the sandbox.
+// This method is part of the instrumentation layer and business logic is included in diagShareInternal.
+func (s *Service) DiagShare(ctx context.Context, request *shimdiag.ShareRequest) (resp *shimdiag.ShareResponse, err error) {
+ ctx, span := oc.StartSpan(ctx, "DiagShare")
+ defer span.End()
+ defer func() { oc.SetSpanStatus(span, err) }()
+
+ span.AddAttributes(
+ trace.StringAttribute(logfields.SandboxID, s.sandboxID),
+ trace.StringAttribute(logfields.HostPath, request.HostPath),
+ trace.StringAttribute(logfields.UVMPath, request.UvmPath),
+ trace.BoolAttribute(logfields.ReadOnly, request.ReadOnly))
+
+ r, e := s.diagShareInternal(ctx, request)
+ return r, errgrpc.ToGRPC(e)
+}
+
+// DiagStacks returns the stack traces of all goroutines in the shim.
+// This method is part of the instrumentation layer and business logic is included in diagStacksInternal.
+func (s *Service) DiagStacks(ctx context.Context, request *shimdiag.StacksRequest) (resp *shimdiag.StacksResponse, err error) {
+ ctx, span := oc.StartSpan(ctx, "DiagStacks")
+ defer span.End()
+ defer func() { oc.SetSpanStatus(span, err) }()
+
+ span.AddAttributes(trace.StringAttribute(logfields.SandboxID, s.sandboxID))
+
+ r, e := s.diagStacksInternal(ctx, request)
+ return r, errgrpc.ToGRPC(e)
+}
+
+// DiagPid returns the process ID (PID) of the shim for diagnostic purposes.
+func (s *Service) DiagPid(ctx context.Context, _ *shimdiag.PidRequest) (resp *shimdiag.PidResponse, err error) {
+ _, span := oc.StartSpan(ctx, "DiagPid")
+ defer span.End()
+ defer func() { oc.SetSpanStatus(span, err) }()
+
+ span.AddAttributes(trace.StringAttribute(logfields.SandboxID, s.sandboxID))
+
+ return &shimdiag.PidResponse{
+ Pid: int32(os.Getpid()),
+ }, nil
+}
diff --git a/cmd/containerd-shim-lcow-v2/service/service_shimdiag_internal.go b/cmd/containerd-shim-lcow-v2/service/service_shimdiag_internal.go
new file mode 100644
index 0000000000..a835ade320
--- /dev/null
+++ b/cmd/containerd-shim-lcow-v2/service/service_shimdiag_internal.go
@@ -0,0 +1,35 @@
+//go:build windows
+
+package service
+
+import (
+ "context"
+ "fmt"
+
+ "github.com/Microsoft/hcsshim/internal/shimdiag"
+ "github.com/containerd/errdefs"
+)
+
+// diagExecInHostInternal is the implementation for DiagExecInHost.
+//
+// It is used to create an exec session into the hosting UVM.
+func (s *Service) diagExecInHostInternal(ctx context.Context, request *shimdiag.ExecProcessRequest) (*shimdiag.ExecProcessResponse, error) {
+ ec, err := s.vmController.ExecIntoHost(ctx, request)
+ if err != nil {
+ return nil, fmt.Errorf("failed to exec into host: %w", err)
+ }
+
+ return &shimdiag.ExecProcessResponse{ExitCode: int32(ec)}, nil
+}
+
// diagTasksInternal is the implementation for DiagTasks.
// Task enumeration is not yet supported by this shim.
func (s *Service) diagTasksInternal(_ context.Context, _ *shimdiag.TasksRequest) (*shimdiag.TasksResponse, error) {
	return nil, errdefs.ErrNotImplemented
}

// diagShareInternal is the implementation for DiagShare.
// Host directory sharing is not yet supported by this shim.
func (s *Service) diagShareInternal(_ context.Context, _ *shimdiag.ShareRequest) (*shimdiag.ShareResponse, error) {
	return nil, errdefs.ErrNotImplemented
}

// diagStacksInternal is the implementation for DiagStacks.
// Goroutine stack dumps are not yet supported by this shim.
func (s *Service) diagStacksInternal(_ context.Context, _ *shimdiag.StacksRequest) (*shimdiag.StacksResponse, error) {
	return nil, errdefs.ErrNotImplemented
}
diff --git a/cmd/containerd-shim-lcow-v2/service/service_task.go b/cmd/containerd-shim-lcow-v2/service/service_task.go
new file mode 100644
index 0000000000..f7f7dda5af
--- /dev/null
+++ b/cmd/containerd-shim-lcow-v2/service/service_task.go
@@ -0,0 +1,339 @@
+//go:build windows
+
+package service
+
+import (
+ "context"
+
+ "github.com/Microsoft/hcsshim/internal/logfields"
+ "github.com/Microsoft/hcsshim/internal/oc"
+
+ "github.com/containerd/containerd/api/runtime/task/v3"
+ "github.com/containerd/errdefs/pkg/errgrpc"
+ "go.opencensus.io/trace"
+ "google.golang.org/protobuf/types/known/emptypb"
+)
+
// Ensure Service implements the TTRPCTaskService interface at compile time.
var _ task.TTRPCTaskService = &Service{}
+
+// State returns the current state of a task or process.
+// This method is part of the instrumentation layer and business logic is included in stateInternal.
+func (s *Service) State(ctx context.Context, request *task.StateRequest) (resp *task.StateResponse, err error) {
+ ctx, span := oc.StartSpan(ctx, "State")
+ defer span.End()
+ defer func() {
+ if resp != nil {
+ span.AddAttributes(
+ trace.StringAttribute(logfields.Status, resp.Status.String()),
+ trace.Int64Attribute(logfields.ExitStatus, int64(resp.ExitStatus)),
+ trace.StringAttribute(logfields.ExitedAt, resp.ExitedAt.String()))
+ }
+ oc.SetSpanStatus(span, err)
+ }()
+
+ span.AddAttributes(
+ trace.StringAttribute(logfields.SandboxID, s.sandboxID),
+ trace.StringAttribute(logfields.ID, request.ID),
+ trace.StringAttribute(logfields.ExecID, request.ExecID))
+
+ r, e := s.stateInternal(ctx, request)
+ return r, errgrpc.ToGRPC(e)
+}
+
+// Create creates a new task.
+// This method is part of the instrumentation layer and business logic is included in createInternal.
+func (s *Service) Create(ctx context.Context, request *task.CreateTaskRequest) (resp *task.CreateTaskResponse, err error) {
+ ctx, span := oc.StartSpan(ctx, "Create")
+ defer span.End()
+ defer func() {
+ if resp != nil {
+ span.AddAttributes(trace.Int64Attribute(logfields.ProcessID, int64(resp.Pid)))
+ }
+ oc.SetSpanStatus(span, err)
+ }()
+
+ span.AddAttributes(
+ trace.StringAttribute(logfields.SandboxID, s.sandboxID),
+ trace.StringAttribute(logfields.ID, request.ID),
+ trace.StringAttribute(logfields.Bundle, request.Bundle),
+ trace.BoolAttribute(logfields.Terminal, request.Terminal),
+ trace.StringAttribute(logfields.Stdin, request.Stdin),
+ trace.StringAttribute(logfields.Stdout, request.Stdout),
+ trace.StringAttribute(logfields.Stderr, request.Stderr),
+ trace.StringAttribute(logfields.Checkpoint, request.Checkpoint),
+ trace.StringAttribute(logfields.ParentCheckpoint, request.ParentCheckpoint))
+
+ r, e := s.createInternal(ctx, request)
+ return r, errgrpc.ToGRPC(e)
+}
+
+// Start starts a previously created task.
+// This method is part of the instrumentation layer and business logic is included in startInternal.
+func (s *Service) Start(ctx context.Context, request *task.StartRequest) (resp *task.StartResponse, err error) {
+ ctx, span := oc.StartSpan(ctx, "Start")
+ defer span.End()
+ defer func() {
+ if resp != nil {
+ span.AddAttributes(trace.Int64Attribute(logfields.ProcessID, int64(resp.Pid)))
+ }
+ oc.SetSpanStatus(span, err)
+ }()
+
+ span.AddAttributes(
+ trace.StringAttribute(logfields.SandboxID, s.sandboxID),
+ trace.StringAttribute(logfields.ID, request.ID),
+ trace.StringAttribute(logfields.ExecID, request.ExecID))
+
+ r, e := s.startInternal(ctx, request)
+ return r, errgrpc.ToGRPC(e)
+}
+
+// Delete deletes a task and returns its exit status.
+// This method is part of the instrumentation layer and business logic is included in deleteInternal.
+func (s *Service) Delete(ctx context.Context, request *task.DeleteRequest) (resp *task.DeleteResponse, err error) {
+ ctx, span := oc.StartSpan(ctx, "Delete")
+ defer span.End()
+ defer func() {
+ if resp != nil {
+ span.AddAttributes(
+ trace.Int64Attribute(logfields.ProcessID, int64(resp.Pid)),
+ trace.Int64Attribute(logfields.ExitStatus, int64(resp.ExitStatus)),
+ trace.StringAttribute(logfields.ExitedAt, resp.ExitedAt.String()))
+ }
+ oc.SetSpanStatus(span, err)
+ }()
+
+ span.AddAttributes(
+ trace.StringAttribute(logfields.SandboxID, s.sandboxID),
+ trace.StringAttribute(logfields.ID, request.ID),
+ trace.StringAttribute(logfields.ExecID, request.ExecID))
+
+ r, e := s.deleteInternal(ctx, request)
+ return r, errgrpc.ToGRPC(e)
+}
+
+// Pids returns all process IDs for a task.
+// This method is part of the instrumentation layer and business logic is included in pidsInternal.
+func (s *Service) Pids(ctx context.Context, request *task.PidsRequest) (resp *task.PidsResponse, err error) {
+ ctx, span := oc.StartSpan(ctx, "Pids")
+ defer span.End()
+ defer func() { oc.SetSpanStatus(span, err) }()
+
+ span.AddAttributes(
+ trace.StringAttribute(logfields.SandboxID, s.sandboxID),
+ trace.StringAttribute(logfields.ID, request.ID))
+
+ r, e := s.pidsInternal(ctx, request)
+ return r, errgrpc.ToGRPC(e)
+}
+
+// Pause pauses a task.
+// This method is part of the instrumentation layer and business logic is included in pauseInternal.
+func (s *Service) Pause(ctx context.Context, request *task.PauseRequest) (resp *emptypb.Empty, err error) {
+ ctx, span := oc.StartSpan(ctx, "Pause")
+ defer span.End()
+ defer func() { oc.SetSpanStatus(span, err) }()
+
+ span.AddAttributes(
+ trace.StringAttribute(logfields.SandboxID, s.sandboxID),
+ trace.StringAttribute(logfields.ID, request.ID))
+
+ r, e := s.pauseInternal(ctx, request)
+ return r, errgrpc.ToGRPC(e)
+}
+
+// Resume resumes a previously paused task.
+// This method is part of the instrumentation layer and business logic is included in resumeInternal.
+func (s *Service) Resume(ctx context.Context, request *task.ResumeRequest) (resp *emptypb.Empty, err error) {
+ ctx, span := oc.StartSpan(ctx, "Resume")
+ defer span.End()
+ defer func() { oc.SetSpanStatus(span, err) }()
+
+ span.AddAttributes(
+ trace.StringAttribute(logfields.SandboxID, s.sandboxID),
+ trace.StringAttribute(logfields.ID, request.ID))
+
+ r, e := s.resumeInternal(ctx, request)
+ return r, errgrpc.ToGRPC(e)
+}
+
+// Checkpoint creates a checkpoint of a task.
+// This method is part of the instrumentation layer and business logic is included in checkpointInternal.
+func (s *Service) Checkpoint(ctx context.Context, request *task.CheckpointTaskRequest) (resp *emptypb.Empty, err error) {
+ ctx, span := oc.StartSpan(ctx, "Checkpoint")
+ defer span.End()
+ defer func() { oc.SetSpanStatus(span, err) }()
+
+ span.AddAttributes(
+ trace.StringAttribute(logfields.SandboxID, s.sandboxID),
+ trace.StringAttribute(logfields.ID, request.ID),
+ trace.StringAttribute(logfields.Path, request.Path))
+
+ r, e := s.checkpointInternal(ctx, request)
+ return r, errgrpc.ToGRPC(e)
+}
+
+// Kill sends a signal to a task or process.
+// This method is part of the instrumentation layer and business logic is included in killInternal.
+func (s *Service) Kill(ctx context.Context, request *task.KillRequest) (resp *emptypb.Empty, err error) {
+ ctx, span := oc.StartSpan(ctx, "Kill")
+ defer span.End()
+ defer func() { oc.SetSpanStatus(span, err) }()
+
+ span.AddAttributes(
+ trace.StringAttribute(logfields.SandboxID, s.sandboxID),
+ trace.StringAttribute(logfields.ID, request.ID),
+ trace.StringAttribute(logfields.ExecID, request.ExecID),
+ trace.Int64Attribute(logfields.Signal, int64(request.Signal)),
+ trace.BoolAttribute(logfields.All, request.All))
+
+ r, e := s.killInternal(ctx, request)
+ return r, errgrpc.ToGRPC(e)
+}
+
+// Exec executes an additional process inside a task.
+// This method is part of the instrumentation layer and business logic is included in execInternal.
+func (s *Service) Exec(ctx context.Context, request *task.ExecProcessRequest) (resp *emptypb.Empty, err error) {
+ ctx, span := oc.StartSpan(ctx, "Exec")
+ defer span.End()
+ defer func() { oc.SetSpanStatus(span, err) }()
+
+ span.AddAttributes(
+ trace.StringAttribute(logfields.SandboxID, s.sandboxID),
+ trace.StringAttribute(logfields.ID, request.ID),
+ trace.StringAttribute(logfields.ExecID, request.ExecID),
+ trace.BoolAttribute(logfields.Terminal, request.Terminal),
+ trace.StringAttribute(logfields.Stdin, request.Stdin),
+ trace.StringAttribute(logfields.Stdout, request.Stdout),
+ trace.StringAttribute(logfields.Stderr, request.Stderr))
+
+ r, e := s.execInternal(ctx, request)
+ return r, errgrpc.ToGRPC(e)
+}
+
+// ResizePty resizes the terminal of a process.
+// This method is part of the instrumentation layer and business logic is included in resizePtyInternal.
+func (s *Service) ResizePty(ctx context.Context, request *task.ResizePtyRequest) (resp *emptypb.Empty, err error) {
+ ctx, span := oc.StartSpan(ctx, "ResizePty")
+ defer span.End()
+ defer func() { oc.SetSpanStatus(span, err) }()
+
+ span.AddAttributes(
+ trace.StringAttribute(logfields.SandboxID, s.sandboxID),
+ trace.StringAttribute(logfields.ID, request.ID),
+ trace.StringAttribute(logfields.ExecID, request.ExecID),
+ trace.Int64Attribute(logfields.Width, int64(request.Width)),
+ trace.Int64Attribute(logfields.Height, int64(request.Height)))
+
+ r, e := s.resizePtyInternal(ctx, request)
+ return r, errgrpc.ToGRPC(e)
+}
+
+// CloseIO closes the IO for a process.
+// This method is part of the instrumentation layer and business logic is included in closeIOInternal.
+func (s *Service) CloseIO(ctx context.Context, request *task.CloseIORequest) (resp *emptypb.Empty, err error) {
+ ctx, span := oc.StartSpan(ctx, "CloseIO")
+ defer span.End()
+ defer func() { oc.SetSpanStatus(span, err) }()
+
+ span.AddAttributes(
+ trace.StringAttribute(logfields.SandboxID, s.sandboxID),
+ trace.StringAttribute(logfields.ID, request.ID),
+ trace.StringAttribute(logfields.ExecID, request.ExecID),
+ trace.BoolAttribute(logfields.Stdin, request.Stdin))
+
+ r, e := s.closeIOInternal(ctx, request)
+ return r, errgrpc.ToGRPC(e)
+}
+
+// Update updates a running task with new resource constraints.
+// This method is part of the instrumentation layer and business logic is included in updateInternal.
+func (s *Service) Update(ctx context.Context, request *task.UpdateTaskRequest) (resp *emptypb.Empty, err error) {
+ ctx, span := oc.StartSpan(ctx, "Update")
+ defer span.End()
+ defer func() { oc.SetSpanStatus(span, err) }()
+
+ span.AddAttributes(
+ trace.StringAttribute(logfields.SandboxID, s.sandboxID),
+ trace.StringAttribute(logfields.ID, request.ID))
+
+ r, e := s.updateInternal(ctx, request)
+ return r, errgrpc.ToGRPC(e)
+}
+
+// Wait waits for a task or process to exit.
+// This method is part of the instrumentation layer and business logic is included in waitInternal.
+func (s *Service) Wait(ctx context.Context, request *task.WaitRequest) (resp *task.WaitResponse, err error) {
+ ctx, span := oc.StartSpan(ctx, "Wait")
+ defer span.End()
+ defer func() {
+ if resp != nil {
+ span.AddAttributes(
+ trace.Int64Attribute(logfields.ExitStatus, int64(resp.ExitStatus)),
+ trace.StringAttribute(logfields.ExitedAt, resp.ExitedAt.String()))
+ }
+ oc.SetSpanStatus(span, err)
+ }()
+
+ span.AddAttributes(
+ trace.StringAttribute(logfields.SandboxID, s.sandboxID),
+ trace.StringAttribute(logfields.ID, request.ID),
+ trace.StringAttribute(logfields.ExecID, request.ExecID))
+
+ r, e := s.waitInternal(ctx, request)
+ return r, errgrpc.ToGRPC(e)
+}
+
+// Stats returns resource usage statistics for a task.
+// This method is part of the instrumentation layer and business logic is included in statsInternal.
+func (s *Service) Stats(ctx context.Context, request *task.StatsRequest) (resp *task.StatsResponse, err error) {
+ ctx, span := oc.StartSpan(ctx, "Stats")
+ defer span.End()
+ defer func() { oc.SetSpanStatus(span, err) }()
+
+ span.AddAttributes(
+ trace.StringAttribute(logfields.SandboxID, s.sandboxID),
+ trace.StringAttribute(logfields.ID, request.ID))
+
+ r, e := s.statsInternal(ctx, request)
+ return r, errgrpc.ToGRPC(e)
+}
+
+// Connect reconnects to a running task.
+// This method is part of the instrumentation layer and business logic is included in connectInternal.
+func (s *Service) Connect(ctx context.Context, request *task.ConnectRequest) (resp *task.ConnectResponse, err error) {
+ ctx, span := oc.StartSpan(ctx, "Connect")
+ defer span.End()
+ defer func() {
+ if resp != nil {
+ span.AddAttributes(
+ trace.Int64Attribute(logfields.ShimPid, int64(resp.ShimPid)),
+ trace.Int64Attribute(logfields.TaskPid, int64(resp.TaskPid)),
+ trace.StringAttribute(logfields.Version, resp.Version))
+ }
+ oc.SetSpanStatus(span, err)
+ }()
+
+ span.AddAttributes(
+ trace.StringAttribute(logfields.SandboxID, s.sandboxID),
+ trace.StringAttribute(logfields.ID, request.ID))
+
+ r, e := s.connectInternal(ctx, request)
+ return r, errgrpc.ToGRPC(e)
+}
+
+// Shutdown gracefully shuts down the Service.
+// This method is part of the instrumentation layer and business logic is included in shutdownInternal.
+func (s *Service) Shutdown(ctx context.Context, request *task.ShutdownRequest) (resp *emptypb.Empty, err error) {
+ ctx, span := oc.StartSpan(ctx, "Shutdown")
+ defer span.End()
+ defer func() { oc.SetSpanStatus(span, err) }()
+
+ span.AddAttributes(
+ trace.StringAttribute(logfields.SandboxID, s.sandboxID),
+ trace.StringAttribute(logfields.ID, request.ID))
+
+ r, e := s.shutdownInternal(ctx, request)
+ return r, errgrpc.ToGRPC(e)
+}
diff --git a/cmd/containerd-shim-lcow-v2/service/service_task_internal.go b/cmd/containerd-shim-lcow-v2/service/service_task_internal.go
new file mode 100644
index 0000000000..254199873b
--- /dev/null
+++ b/cmd/containerd-shim-lcow-v2/service/service_task_internal.go
@@ -0,0 +1,79 @@
+//go:build windows
+
+package service
+
+import (
+ "context"
+
+ "github.com/containerd/containerd/api/runtime/task/v3"
+ "github.com/containerd/errdefs"
+ "google.golang.org/protobuf/types/known/emptypb"
+)
+
+// The *Internal methods below carry the business logic for the corresponding
+// task-service RPCs invoked by the instrumentation wrappers in
+// service_task.go. They are currently stubs that return
+// errdefs.ErrNotImplemented and will be filled in as LCOW task support is
+// implemented.
+
+func (s *Service) stateInternal(_ context.Context, _ *task.StateRequest) (*task.StateResponse, error) {
+	return nil, errdefs.ErrNotImplemented
+}
+
+func (s *Service) createInternal(_ context.Context, _ *task.CreateTaskRequest) (*task.CreateTaskResponse, error) {
+	return nil, errdefs.ErrNotImplemented
+}
+
+func (s *Service) startInternal(_ context.Context, _ *task.StartRequest) (*task.StartResponse, error) {
+	return nil, errdefs.ErrNotImplemented
+}
+
+func (s *Service) deleteInternal(_ context.Context, _ *task.DeleteRequest) (*task.DeleteResponse, error) {
+	return nil, errdefs.ErrNotImplemented
+}
+
+func (s *Service) pidsInternal(_ context.Context, _ *task.PidsRequest) (*task.PidsResponse, error) {
+	return nil, errdefs.ErrNotImplemented
+}
+
+func (s *Service) pauseInternal(_ context.Context, _ *task.PauseRequest) (*emptypb.Empty, error) {
+	return nil, errdefs.ErrNotImplemented
+}
+
+func (s *Service) resumeInternal(_ context.Context, _ *task.ResumeRequest) (*emptypb.Empty, error) {
+	return nil, errdefs.ErrNotImplemented
+}
+
+func (s *Service) checkpointInternal(_ context.Context, _ *task.CheckpointTaskRequest) (*emptypb.Empty, error) {
+	return nil, errdefs.ErrNotImplemented
+}
+
+func (s *Service) killInternal(_ context.Context, _ *task.KillRequest) (*emptypb.Empty, error) {
+	return nil, errdefs.ErrNotImplemented
+}
+
+func (s *Service) execInternal(_ context.Context, _ *task.ExecProcessRequest) (*emptypb.Empty, error) {
+	return nil, errdefs.ErrNotImplemented
+}
+
+func (s *Service) resizePtyInternal(_ context.Context, _ *task.ResizePtyRequest) (*emptypb.Empty, error) {
+	return nil, errdefs.ErrNotImplemented
+}
+
+func (s *Service) closeIOInternal(_ context.Context, _ *task.CloseIORequest) (*emptypb.Empty, error) {
+	return nil, errdefs.ErrNotImplemented
+}
+
+func (s *Service) updateInternal(_ context.Context, _ *task.UpdateTaskRequest) (*emptypb.Empty, error) {
+	return nil, errdefs.ErrNotImplemented
+}
+
+func (s *Service) waitInternal(_ context.Context, _ *task.WaitRequest) (*task.WaitResponse, error) {
+	return nil, errdefs.ErrNotImplemented
+}
+
+func (s *Service) statsInternal(_ context.Context, _ *task.StatsRequest) (*task.StatsResponse, error) {
+	return nil, errdefs.ErrNotImplemented
+}
+
+func (s *Service) connectInternal(_ context.Context, _ *task.ConnectRequest) (*task.ConnectResponse, error) {
+	return nil, errdefs.ErrNotImplemented
+}
+
+func (s *Service) shutdownInternal(_ context.Context, _ *task.ShutdownRequest) (*emptypb.Empty, error) {
+	return nil, errdefs.ErrNotImplemented
+}
diff --git a/cmd/containerd-shim-lcow-v2/versioninfo.json b/cmd/containerd-shim-lcow-v2/versioninfo.json
new file mode 100644
index 0000000000..11316902d5
--- /dev/null
+++ b/cmd/containerd-shim-lcow-v2/versioninfo.json
@@ -0,0 +1,44 @@
+{
+ "FixedFileInfo": {
+ "FileVersion": {
+ "Major": 1,
+ "Minor": 0,
+ "Patch": 0,
+ "Build": 0
+ },
+ "ProductVersion": {
+ "Major": 1,
+ "Minor": 0,
+ "Patch": 0,
+ "Build": 0
+ },
+ "FileFlagsMask": "3f",
+ "FileFlags ": "00",
+ "FileOS": "040004",
+ "FileType": "01",
+ "FileSubType": "00"
+ },
+ "StringFileInfo": {
+ "Comments": "",
+ "CompanyName": "Microsoft",
+ "FileDescription": "",
+ "FileVersion": "",
+ "InternalName": "",
+ "LegalCopyright": "",
+ "LegalTrademarks": "",
+ "OriginalFilename": "containerd-shim-lcow-v2.exe",
+ "PrivateBuild": "",
+ "ProductName": "lcow shim",
+ "ProductVersion": "v1.0.0.0",
+ "SpecialBuild": ""
+ },
+ "VarFileInfo": {
+ "Translation": {
+ "LangID": "0409",
+ "CharsetID": "04B0"
+ }
+ },
+ "IconPath": "",
+ "ManifestPath": "containerd-shim-lcow-v2.exe.manifest"
+}
+
diff --git a/internal/controller/vm/doc.go b/internal/controller/vm/doc.go
new file mode 100644
index 0000000000..304e117157
--- /dev/null
+++ b/internal/controller/vm/doc.go
@@ -0,0 +1,76 @@
+//go:build windows
+
+// Package vm provides a controller for managing the lifecycle of a Utility VM (UVM).
+//
+// A Utility VM is a lightweight virtual machine used to host Linux (LCOW) or
+// Windows (WCOW) containers. This package abstracts the VM lifecycle —
+// creation, startup, stats collection, and termination — behind the [Controller]
+// interface, with [Manager] as the primary implementation.
+//
+// # Lifecycle
+//
+// A VM follows the state machine below.
+//
+// ┌─────────────────┐
+// │ StateNotCreated │
+// └────────┬────────┘
+// │ CreateVM ok
+// ▼
+// ┌─────────────────┐ StartVM fails /
+// │ StateCreated │──────── TerminateVM fails ──────┐
+// └──┬─────┬────────┘ │
+// │ │ StartVM ok ▼
+// │ ▼ ┌───────────────┐
+// │ ┌─────────────────┐ TerminateVM │ StateInvalid │
+// │ │ StateRunning │───── fails ──────►│ │
+// │ └────────┬────────┘ └───────┬───────┘
+// │ │ VM exits / │ TerminateVM ok
+// TerminateVM ok │ TerminateVM ok │
+// │ ▼ ▼
+// │ ┌─────────────────────────────────────────────────┐
+// └─►│ StateTerminated │
+// └─────────────────────────────────────────────────┘
+//
+// State descriptions:
+//
+// - [StateNotCreated]: initial state after [NewController] is called.
+// - [StateCreated]: after [Controller.CreateVM] succeeds; the VM exists but has not started.
+// - [StateRunning]: after [Controller.StartVM] succeeds; the guest OS is up and the
+// Guest Compute Service (GCS) connection is established.
+// - [StateTerminated]: terminal state reached after the VM exits naturally or
+// [Controller.TerminateVM] completes successfully.
+// - [StateInvalid]: error state entered when [Controller.StartVM] fails after the underlying
+// HCS VM has already started, or when [Controller.TerminateVM] fails during uvm.Close
+// (from either [StateCreated] or [StateRunning]).
+// A VM in this state can only be cleaned up by calling [Controller.TerminateVM].
+//
+// # Platform Variants
+//
+// Certain behaviors differ between LCOW and WCOW guests and are implemented in
+// platform-specific source files selected via build tags (default for lcow shim and "wcow" tag for wcow shim).
+//
+// # Usage
+//
+// ctrl := vm.NewController()
+//
+// if err := ctrl.CreateVM(ctx, &vm.CreateOptions{
+// ID: "my-uvm",
+// HCSDocument: doc,
+// }); err != nil {
+// // handle error
+// }
+//
+// if err := ctrl.StartVM(ctx, &vm.StartOptions{
+// GCSServiceID: serviceGUID,
+// }); err != nil {
+// // handle error
+// }
+//
+// // ... use ctrl.Guest() for guest interactions ...
+//
+// if err := ctrl.TerminateVM(ctx); err != nil {
+// // handle error
+// }
+//
+// _ = ctrl.Wait(ctx)
+package vm
diff --git a/internal/controller/vm/interface.go b/internal/controller/vm/interface.go
new file mode 100644
index 0000000000..3629e21e33
--- /dev/null
+++ b/internal/controller/vm/interface.go
@@ -0,0 +1,88 @@
+//go:build windows
+
+package vm
+
+import (
+ "context"
+ "time"
+
+ "github.com/Microsoft/hcsshim/cmd/containerd-shim-runhcs-v1/stats"
+ hcsschema "github.com/Microsoft/hcsshim/internal/hcs/schema2"
+ "github.com/Microsoft/hcsshim/internal/protocol/guestresource"
+ "github.com/Microsoft/hcsshim/internal/shimdiag"
+ "github.com/Microsoft/hcsshim/internal/vm/guestmanager"
+
+ "github.com/Microsoft/go-winio/pkg/guid"
+)
+
+// Controller manages the full lifecycle of a Utility VM — creation, startup,
+// stats collection, and termination. See the package documentation for the
+// state machine implementations must follow.
+type Controller interface {
+	// Guest returns the guest manager instance for this VM.
+	Guest() *guestmanager.Guest
+
+	// State returns the current VM state.
+	State() State
+
+	// CreateVM creates and initializes a new VM with the specified options.
+	// This prepares the VM but does not start it.
+	CreateVM(ctx context.Context, opts *CreateOptions) error
+
+	// StartVM starts the created VM with the specified options.
+	// This establishes the guest connection, sets up necessary listeners for
+	// guest-host communication, and transitions the VM to StateRunning.
+	StartVM(context.Context, *StartOptions) error
+
+	// ExecIntoHost executes a command in the running UVM.
+	ExecIntoHost(ctx context.Context, request *shimdiag.ExecProcessRequest) (int, error)
+
+	// DumpStacks dumps the GCS stacks associated with the VM.
+	DumpStacks(ctx context.Context) (string, error)
+
+	// Wait blocks until the VM exits or the context is cancelled.
+	// It also waits for log output processing to complete.
+	Wait(ctx context.Context) error
+
+	// Stats returns runtime statistics (processor and memory usage) for the
+	// running VM.
+	Stats(ctx context.Context) (*stats.VirtualMachineStatistics, error)
+
+	// TerminateVM forcefully terminates the VM and releases its resources.
+	TerminateVM(context.Context) error
+
+	// StartTime returns the timestamp when the VM was started.
+	// Returns zero value of time.Time, if the VM is not in StateRunning or StateTerminated.
+	StartTime() time.Time
+
+	// ExitStatus returns information about the stopped VM, including when it
+	// stopped and any exit error. Returns an error if the VM is not in StateTerminated.
+	ExitStatus() (*ExitStatus, error)
+}
+
+// CreateOptions contains the configuration needed to create a new VM.
+type CreateOptions struct {
+	// ID specifies the unique identifier for the VM.
+	ID string
+
+	// HCSDocument specifies the HCS schema document used to create the VM.
+	// Its memory configuration also determines whether the VM is treated as
+	// physically backed (AllowOvercommit disabled).
+	HCSDocument *hcsschema.ComputeSystem
+}
+
+// StartOptions contains the configuration needed to start a VM and establish
+// the Guest Compute Service (GCS) connection. It is consumed by
+// [Controller.StartVM].
+type StartOptions struct {
+	// GCSServiceID specifies the GUID for the GCS vsock service.
+	GCSServiceID guid.GUID
+
+	// ConfigOptions specifies additional configuration options for the guest config.
+	ConfigOptions []guestmanager.ConfigOption
+
+	// ConfidentialOptions specifies security policy and confidential computing
+	// options for the VM. This is optional and only used for confidential VMs.
+	ConfidentialOptions *guestresource.ConfidentialOptions
+}
+
+// ExitStatus contains information about a stopped VM's final state.
+// It is returned by [Controller.ExitStatus] once the VM has reached
+// StateTerminated.
+type ExitStatus struct {
+	// StoppedTime is the timestamp when the VM stopped.
+	StoppedTime time.Time
+
+	// Err is the error that caused the VM to stop, if any.
+	// This will be nil if the VM exited cleanly.
+	Err error
+}
diff --git a/internal/controller/vm/state.go b/internal/controller/vm/state.go
new file mode 100644
index 0000000000..6e98eb4ae1
--- /dev/null
+++ b/internal/controller/vm/state.go
@@ -0,0 +1,78 @@
+//go:build windows
+
+package vm
+
+// State represents the current state of the VM lifecycle.
+//
+// The normal progression is:
+//
+//	StateNotCreated → StateCreated → StateRunning → StateTerminated
+//
+// If an unrecoverable error occurs during [Controller.StartVM] or
+// [Controller.TerminateVM], the VM transitions to [StateInvalid] instead.
+// A VM in [StateInvalid] can only be cleaned up via [Controller.TerminateVM].
+//
+// State implements [fmt.Stringer] via [State.String].
+//
+// Full state-transition table:
+//
+//	Current State   │ Trigger                            │ Next State
+//	─────────────────┼────────────────────────────────────┼─────────────────
+//	StateNotCreated │ CreateVM succeeds                  │ StateCreated
+//	StateCreated    │ StartVM succeeds                   │ StateRunning
+//	StateCreated    │ TerminateVM succeeds               │ StateTerminated
+//	StateCreated    │ StartVM fails                      │ StateInvalid
+//	StateCreated    │ TerminateVM fails                  │ StateInvalid
+//	StateRunning    │ VM exits or TerminateVM succeeds   │ StateTerminated
+//	StateRunning    │ TerminateVM fails (uvm.Close)      │ StateInvalid
+//	StateInvalid    │ TerminateVM called                 │ StateTerminated
+//	StateTerminated │ (terminal — no further transitions)│ —
+type State int32
+
+const (
+	// StateNotCreated indicates the VM has not been created yet.
+	// This is the initial state when a Controller is first instantiated via [NewController].
+	// Valid transitions: StateNotCreated → StateCreated (via [Controller.CreateVM])
+	StateNotCreated State = iota
+
+	// StateCreated indicates the VM has been created but not yet started.
+	// Valid transitions:
+	//   - StateCreated → StateRunning (via [Controller.StartVM], on success)
+	//   - StateCreated → StateTerminated (via [Controller.TerminateVM], on success)
+	//   - StateCreated → StateInvalid (via [Controller.StartVM], on failure)
+	//   - StateCreated → StateInvalid (via [Controller.TerminateVM], on failure)
+	StateCreated
+
+	// StateRunning indicates the VM has been started and is running.
+	// The guest OS is up and the Guest Compute Service (GCS) connection is established.
+	// Valid transitions:
+	//   - StateRunning → StateTerminated (VM exits naturally or [Controller.TerminateVM] succeeds)
+	//   - StateRunning → StateInvalid ([Controller.TerminateVM] fails during uvm.Close)
+	StateRunning
+
+	// StateTerminated indicates the VM has exited or been successfully terminated.
+	// This is a terminal state — once reached, no further state transitions are possible.
+	StateTerminated
+
+	// StateInvalid indicates that an unrecoverable error has occurred.
+	// The VM transitions to this state when:
+	//   - [Controller.StartVM] fails after the underlying HCS VM has already started, or
+	//   - [Controller.TerminateVM] fails during uvm.Close (from either [StateCreated] or [StateRunning]).
+	// A VM in this state can only be cleaned up by calling [Controller.TerminateVM].
+	StateInvalid
+)
+
+// String returns a human-readable string representation of the VM State.
+// Unknown values (anything outside the declared constants) render as
+// "Unknown".
+func (s State) String() string {
+	// Indexed by the iota-assigned State values (0 through 4).
+	names := [...]string{
+		"NotCreated",
+		"Created",
+		"Running",
+		"Terminated",
+		"Invalid",
+	}
+	if s >= 0 && int(s) < len(names) {
+		return names[s]
+	}
+	return "Unknown"
+}
diff --git a/internal/controller/vm/vm.go b/internal/controller/vm/vm.go
new file mode 100644
index 0000000000..e4c8c42736
--- /dev/null
+++ b/internal/controller/vm/vm.go
@@ -0,0 +1,442 @@
+//go:build windows
+
+package vm
+
+import (
+ "context"
+ "errors"
+ "fmt"
+ "sync"
+ "sync/atomic"
+ "time"
+
+ "github.com/Microsoft/hcsshim/cmd/containerd-shim-runhcs-v1/stats"
+ "github.com/Microsoft/hcsshim/internal/cmd"
+ hcsschema "github.com/Microsoft/hcsshim/internal/hcs/schema2"
+ "github.com/Microsoft/hcsshim/internal/log"
+ "github.com/Microsoft/hcsshim/internal/logfields"
+ "github.com/Microsoft/hcsshim/internal/shimdiag"
+ "github.com/Microsoft/hcsshim/internal/timeout"
+ "github.com/Microsoft/hcsshim/internal/vm/guestmanager"
+ "github.com/Microsoft/hcsshim/internal/vm/vmmanager"
+ "github.com/Microsoft/hcsshim/internal/vm/vmutils"
+ iwin "github.com/Microsoft/hcsshim/internal/windows"
+ "github.com/containerd/errdefs"
+
+ "github.com/Microsoft/go-winio/pkg/process"
+ "github.com/sirupsen/logrus"
+ "golang.org/x/sync/errgroup"
+ "golang.org/x/sys/windows"
+)
+
+// Manager is the VM controller implementation that manages the lifecycle of a Utility VM
+// and its associated resources.
+type Manager struct {
+	vmID  string
+	uvm   *vmmanager.UtilityVM
+	guest *guestmanager.Guest
+
+	// vmState tracks the current state of the VM lifecycle.
+	// Access must be guarded by mu.
+	vmState State
+
+	// mu guards the concurrent access to the Manager's fields and operations.
+	mu sync.RWMutex
+
+	// logOutputDone is closed when the GCS log output processing goroutine completes.
+	logOutputDone chan struct{}
+
+	// Handle to the vmmem process associated with this UVM. Used to look up
+	// memory metrics for the UVM. Lazily initialized by Stats.
+	// NOTE(review): Stats writes this field while holding only the read
+	// lock — confirm all writers take the write lock.
+	vmmemProcess windows.Handle
+
+	// activeExecCount tracks the number of ongoing ExecIntoHost calls.
+	activeExecCount atomic.Int64
+
+	// isPhysicallyBacked indicates whether the VM is using physical backing for its memory.
+	isPhysicallyBacked bool
+}
+
+// Compile-time assertion that Manager implements the Controller interface.
+var _ Controller = (*Manager)(nil)
+
+// NewController creates a new Manager instance in the [StateNotCreated] state.
+// The returned Manager must go through CreateVM and StartVM before it can
+// service guest operations. logOutputDone is closed by the platform-specific
+// logging listener once log output processing finishes.
+func NewController() *Manager {
+	return &Manager{
+		logOutputDone: make(chan struct{}),
+		vmState:       StateNotCreated,
+	}
+}
+
+// Guest returns the guest manager instance for this VM.
+// The guest manager provides access to guest-host communication.
+//
+// NOTE(review): c.guest is read here without holding mu, while CreateVM
+// assigns it under the lock — confirm Guest is never called concurrently
+// with CreateVM.
+func (c *Manager) Guest() *guestmanager.Guest {
+	return c.guest
+}
+
+// State reports the VM's current lifecycle state.
+func (c *Manager) State() State {
+	c.mu.RLock()
+	current := c.vmState
+	c.mu.RUnlock()
+	return current
+}
+
+// CreateVM creates the VM using the HCS document and initializes device state.
+// The Manager must be in [StateNotCreated]; on success it transitions to
+// [StateCreated]. CreateVM prepares the VM but does not start it — see
+// [Manager.StartVM].
+func (c *Manager) CreateVM(ctx context.Context, opts *CreateOptions) error {
+	ctx, _ = log.WithContext(ctx, logrus.WithField(logfields.Operation, "CreateVM"))
+
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	// In case of duplicate CreateVM call for the same controller, we want to fail.
+	if c.vmState != StateNotCreated {
+		return fmt.Errorf("cannot create VM: VM is in incorrect state %s", c.vmState)
+	}
+
+	// Validate the options before creating anything: the memory topology is
+	// dereferenced below to determine physical backing, and a malformed
+	// document would otherwise panic after the VM had already been created,
+	// leaking it.
+	if opts == nil || opts.HCSDocument == nil ||
+		opts.HCSDocument.VirtualMachine == nil ||
+		opts.HCSDocument.VirtualMachine.ComputeTopology == nil ||
+		opts.HCSDocument.VirtualMachine.ComputeTopology.Memory == nil {
+		return fmt.Errorf("cannot create VM: HCS document is missing virtual machine memory configuration")
+	}
+
+	// Create the VM via vmmanager.
+	uvm, err := vmmanager.Create(ctx, opts.ID, opts.HCSDocument)
+	if err != nil {
+		return fmt.Errorf("failed to create VM: %w", err)
+	}
+
+	// Set the Manager parameters after successful creation.
+	c.vmID = opts.ID
+	c.uvm = uvm
+	// Determine if the VM is physically backed based on the HCS document configuration.
+	// We need this while extracting memory metrics, as some of them are only relevant for physically backed VMs.
+	c.isPhysicallyBacked = !opts.HCSDocument.VirtualMachine.ComputeTopology.Memory.AllowOvercommit
+
+	// Initialize the GuestManager for managing guest interactions.
+	// We will create the guest connection via GuestManager during StartVM.
+	c.guest = guestmanager.New(ctx, uvm)
+
+	c.vmState = StateCreated
+	return nil
+}
+
+// StartVM starts the VM that was previously created via CreateVM.
+// It starts the underlying HCS VM, establishes the GCS connection,
+// and transitions the VM to [StateRunning].
+// On any failure the VM is transitioned to [StateInvalid].
+func (c *Manager) StartVM(ctx context.Context, opts *StartOptions) (err error) {
+	ctx, _ = log.WithContext(ctx, logrus.WithField(logfields.Operation, "StartVM"))
+
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	// If the VM is already running, we can skip the start operation and just return.
+	// This makes StartVM idempotent in the case of duplicate calls.
+	if c.vmState == StateRunning {
+		return nil
+	}
+	// However, if the VM is in any other state than Created,
+	// we should fail as StartVM is only valid on a created VM.
+	if c.vmState != StateCreated {
+		return fmt.Errorf("cannot start VM: VM is in incorrect state %s", c.vmState)
+	}
+
+	// err is the named return value, so this defer observes the final outcome
+	// of the method regardless of which step below failed.
+	defer func() {
+		if err != nil {
+			// If starting the VM fails, we transition to Invalid state to prevent any further operations on the VM.
+			// The VM can be terminated by invoking TerminateVM.
+			c.vmState = StateInvalid
+		}
+	}()
+
+	// save parent context, without timeout to use for wait.
+	pCtx := ctx
+	// For remaining operations, we expect them to complete within the GCS connection timeout,
+	// otherwise we want to fail.
+	ctx, cancel := context.WithTimeout(pCtx, timeout.GCSConnectionTimeout)
+	log.G(ctx).Debugf("using gcs connection timeout: %s\n", timeout.GCSConnectionTimeout)
+
+	g, gctx := errgroup.WithContext(ctx)
+	// Deferred functions run LIFO: cancel() (registered below) fires first to
+	// unblock any listener goroutines still waiting, then this Wait drains
+	// them so none outlive the method.
+	defer func() {
+		_ = g.Wait()
+	}()
+	defer cancel()
+
+	// we should set up the necessary listeners for guest-host communication.
+	// The guest needs to connect to predefined vsock ports.
+	// The host must already be listening on these ports before the guest attempts to connect,
+	// otherwise the connection would fail.
+	// These are platform-specific (LCOW vs WCOW) implementations.
+	c.setupEntropyListener(gctx, g)
+	c.setupLoggingListener(gctx, g)
+
+	err = c.uvm.Start(ctx)
+	if err != nil {
+		return fmt.Errorf("failed to start VM: %w", err)
+	}
+
+	// Start waiting on the utility VM in the background.
+	// This goroutine will complete when the VM exits.
+	// Note: pCtx (the non-timeout parent) is used so the background wait is
+	// not bounded by the GCS connection timeout.
+	go c.waitForVMExit(pCtx)
+
+	// Collect any errors from writing entropy or establishing the log
+	// connection.
+	if err = g.Wait(); err != nil {
+		return err
+	}
+
+	err = c.guest.CreateConnection(ctx, opts.GCSServiceID, opts.ConfigOptions...)
+	if err != nil {
+		return fmt.Errorf("failed to create guest connection: %w", err)
+	}
+
+	// Platform-specific (LCOW/WCOW) post-connection finalization.
+	err = c.finalizeGCSConnection(ctx)
+	if err != nil {
+		return fmt.Errorf("failed to finalize GCS connection: %w", err)
+	}
+
+	// Set the confidential options if applicable.
+	if opts.ConfidentialOptions != nil {
+		if err := c.guest.AddSecurityPolicy(ctx, *opts.ConfidentialOptions); err != nil {
+			return fmt.Errorf("failed to set confidential options: %w", err)
+		}
+	}
+
+	// If all goes well, we can transition the VM to Running state.
+	c.vmState = StateRunning
+
+	return nil
+}
+
+// waitForVMExit blocks until the VM exits and then transitions the VM state to [StateTerminated].
+// This is called in StartVM in a background goroutine.
+func (c *Manager) waitForVMExit(ctx context.Context) {
+	// The original context may have timeout or propagate a cancellation;
+	// copy the original to prevent it affecting the background wait goroutine.
+	ctx = context.WithoutCancel(ctx)
+	_ = c.uvm.Wait(ctx)
+
+	// Once the VM has exited, transition to Terminated — but only from
+	// Running. If TerminateVM already ran concurrently the state is either
+	// Terminated (nothing to do) or Invalid. Per the documented state
+	// machine, Invalid may only be cleared by another TerminateVM call, so
+	// it must not be overwritten (masked) here. Log the no-op so
+	// concurrent-termination races remain observable.
+	c.mu.Lock()
+	if c.vmState == StateRunning {
+		c.vmState = StateTerminated
+	} else {
+		log.G(ctx).WithField("currentState", c.vmState).Debug("waitForVMExit: state transition to Terminated was a no-op")
+	}
+	c.mu.Unlock()
+}
+
+// ExecIntoHost executes a command in the running UVM (not in a container).
+// It returns the process exit code, or -1 together with an error.
+func (c *Manager) ExecIntoHost(ctx context.Context, request *shimdiag.ExecProcessRequest) (int, error) {
+	ctx, _ = log.WithContext(ctx, logrus.WithField(logfields.Operation, "ExecIntoHost"))
+
+	// A terminal multiplexes stdout/stderr onto a single stream, so a
+	// separate stderr destination cannot be honored.
+	if request.Terminal && request.Stderr != "" {
+		return -1, fmt.Errorf("if using terminal, stderr must be empty: %w", errdefs.ErrFailedPrecondition)
+	}
+
+	// Validate that the VM is running before allowing exec into it.
+	// NOTE(review): the read lock is released before ExecIntoUVM below, so
+	// the VM could terminate between this check and the exec — confirm the
+	// guest call fails cleanly in that window.
+	c.mu.RLock()
+	if c.vmState != StateRunning {
+		c.mu.RUnlock()
+		return -1, fmt.Errorf("cannot exec into VM: VM is in incorrect state %s", c.vmState)
+	}
+	c.mu.RUnlock()
+
+	// Keep a count of active exec sessions.
+	// This will be used to disallow LM (presumably live migration) with existing exec sessions,
+	// as that can lead to orphaned processes within UVM.
+	c.activeExecCount.Add(1)
+	defer c.activeExecCount.Add(-1)
+
+	// Translate the shimdiag request into the generic command request
+	// understood by the guest manager.
+	cmdReq := &cmd.CmdProcessRequest{
+		Args:     request.Args,
+		Workdir:  request.Workdir,
+		Terminal: request.Terminal,
+		Stdin:    request.Stdin,
+		Stdout:   request.Stdout,
+		Stderr:   request.Stderr,
+	}
+	return c.guest.ExecIntoUVM(ctx, cmdReq)
+}
+
+// DumpStacks collects the GCS stack dump for the VM, when the guest
+// advertises support for it; otherwise it returns an empty string.
+func (c *Manager) DumpStacks(ctx context.Context) (string, error) {
+	ctx, _ = log.WithContext(ctx, logrus.WithField(logfields.Operation, "DumpStacks"))
+
+	// Hold the read lock for the duration of the call so the state cannot
+	// change underneath us while we talk to the guest.
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+
+	// Dumping stacks is only meaningful for a running VM.
+	if c.vmState != StateRunning {
+		return "", fmt.Errorf("cannot dump stacks: VM is in incorrect state %s", c.vmState)
+	}
+
+	// Quietly skip guests that do not support the stack-dump RPC.
+	if !c.guest.Capabilities().IsDumpStacksSupported() {
+		return "", nil
+	}
+
+	return c.guest.DumpStacks(ctx)
+}
+
+// Wait blocks until the VM exits and all log output processing has completed,
+// or the context is cancelled. It may be called once the VM has been created;
+// waiting on an already-terminated VM returns promptly.
+func (c *Manager) Wait(ctx context.Context) error {
+	ctx, _ = log.WithContext(ctx, logrus.WithField(logfields.Operation, "Wait"))
+
+	// Validate that the VM has been created and can be waited on.
+	// Terminated VMs can also be waited on where we return immediately.
+	c.mu.RLock()
+	if c.vmState == StateNotCreated {
+		c.mu.RUnlock()
+		return fmt.Errorf("cannot wait on VM: VM is in incorrect state %s", c.vmState)
+	}
+	c.mu.RUnlock()
+
+	// Wait for the utility VM to exit.
+	// This will be unblocked when the VM exits or if the context is cancelled.
+	err := c.uvm.Wait(ctx)
+
+	// Wait for the log output processing to complete,
+	// which ensures all logs are processed before we return.
+	// logOutputDone is closed by the platform-specific logging listener on
+	// both its success and failure paths.
+	select {
+	case <-ctx.Done():
+		// Cancelled before log processing finished; surface both errors.
+		ctxErr := fmt.Errorf("failed to wait on uvm output processing: %w", ctx.Err())
+		err = errors.Join(err, ctxErr)
+	case <-c.logOutputDone:
+	}
+
+	return err
+}
+
+// Stats returns runtime statistics for the VM including processor runtime and
+// memory usage. The VM must be in [StateRunning].
+func (c *Manager) Stats(ctx context.Context) (*stats.VirtualMachineStatistics, error) {
+	ctx, _ = log.WithContext(ctx, logrus.WithField(logfields.Operation, "Stats"))
+
+	// Take the write lock: besides keeping the state consistent for the
+	// duration of the method, Stats may lazily initialize c.vmmemProcess
+	// below. Performing that write under the read lock would be a data race
+	// (and could leak a duplicate handle) when two Stats calls run
+	// concurrently.
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	if c.vmState != StateRunning {
+		return nil, fmt.Errorf("cannot get stats: VM is in incorrect state %s", c.vmState)
+	}
+
+	// Lazy initialization of vmmemProcess to calculate stats properly for VA-backed UVMs.
+	if c.vmmemProcess == 0 {
+		vmmemHandle, err := vmutils.LookupVMMEM(ctx, c.uvm.RuntimeID(), &iwin.WinAPI{})
+		if err != nil {
+			return nil, fmt.Errorf("cannot get stats: %w", err)
+		}
+		c.vmmemProcess = vmmemHandle
+	}
+
+	s := &stats.VirtualMachineStatistics{}
+	props, err := c.uvm.PropertiesV2(ctx, hcsschema.PTStatistics, hcsschema.PTMemory)
+	if err != nil {
+		return nil, fmt.Errorf("failed to get VM properties: %w", err)
+	}
+	s.Processor = &stats.VirtualMachineProcessorStatistics{}
+	s.Processor.TotalRuntimeNS = uint64(props.Statistics.Processor.TotalRuntime100ns * 100)
+
+	s.Memory = &stats.VirtualMachineMemoryStatistics{}
+	if !c.isPhysicallyBacked {
+		// The HCS properties does not return sufficient information to calculate
+		// working set size for a VA-backed UVM. To work around this, we instead
+		// locate the vmmem process for the VM, and query that process's working set
+		// instead, which will be the working set for the VM.
+		memCounters, err := process.GetProcessMemoryInfo(c.vmmemProcess)
+		if err != nil {
+			return nil, err
+		}
+		s.Memory.WorkingSetBytes = uint64(memCounters.WorkingSetSize)
+	}
+
+	if props.Memory != nil {
+		if c.isPhysicallyBacked {
+			// If the uvm is physically backed we set the working set to the total amount allocated
+			// to the UVM. AssignedMemory returns the number of 4KB pages. Will always be 4KB
+			// regardless of what the UVMs actual page size is so we don't need that information.
+			s.Memory.WorkingSetBytes = props.Memory.VirtualMachineMemory.AssignedMemory * 4096
+		}
+		s.Memory.VirtualNodeCount = props.Memory.VirtualNodeCount
+		s.Memory.VmMemory = &stats.VirtualMachineMemory{}
+		s.Memory.VmMemory.AvailableMemory = props.Memory.VirtualMachineMemory.AvailableMemory
+		s.Memory.VmMemory.AvailableMemoryBuffer = props.Memory.VirtualMachineMemory.AvailableMemoryBuffer
+		s.Memory.VmMemory.ReservedMemory = props.Memory.VirtualMachineMemory.ReservedMemory
+		s.Memory.VmMemory.AssignedMemory = props.Memory.VirtualMachineMemory.AssignedMemory
+		s.Memory.VmMemory.SlpActive = props.Memory.VirtualMachineMemory.SlpActive
+		s.Memory.VmMemory.BalancingEnabled = props.Memory.VirtualMachineMemory.BalancingEnabled
+		s.Memory.VmMemory.DmOperationInProgress = props.Memory.VirtualMachineMemory.DmOperationInProgress
+	}
+	return s, nil
+}
+
+// TerminateVM forcefully terminates a running VM, closes the guest connection,
+// and releases HCS resources.
+//
+// The context is used for all operations, including waits, so timeouts/cancellations may prevent
+// proper UVM cleanup.
+func (c *Manager) TerminateVM(ctx context.Context) (err error) {
+	ctx, _ = log.WithContext(ctx, logrus.WithField(logfields.Operation, "TerminateVM"))
+
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	// If the VM has already terminated, we can skip termination and just return.
+	// Alternatively, if the VM was never created, we can also skip termination.
+	// This makes the TerminateVM operation idempotent.
+	if c.vmState == StateTerminated || c.vmState == StateNotCreated {
+		return nil
+	}
+
+	// Best effort attempt to clean up the open vmmem handle. The handle is
+	// lazily initialized by Stats, so it may never have been opened; guard
+	// against closing the zero handle, and zero it afterwards so a retried
+	// TerminateVM (after a Close failure below) does not double-close it.
+	if c.vmmemProcess != 0 {
+		_ = windows.Close(c.vmmemProcess)
+		c.vmmemProcess = 0
+	}
+	// Terminate the utility VM. This will also cause the Wait() call in the background goroutine to unblock.
+	_ = c.uvm.Terminate(ctx)
+
+	// Closing the guest connection is best effort; log and continue.
+	if err := c.guest.CloseConnection(); err != nil {
+		log.G(ctx).Errorf("close guest connection failed: %s", err)
+	}
+
+	err = c.uvm.Close(ctx)
+	if err != nil {
+		// Transition to Invalid so no further active operations can be performed on the VM.
+		c.vmState = StateInvalid
+		return fmt.Errorf("failed to close utility VM: %w", err)
+	}
+
+	// Set the Terminated status at the end.
+	c.vmState = StateTerminated
+	return nil
+}
+
+// StartTime returns the timestamp when the VM was started.
+// It returns the zero time.Time if the VM has not yet reached
+// [StateRunning] or [StateTerminated].
+func (c *Manager) StartTime() time.Time {
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+
+	switch c.vmState {
+	case StateRunning, StateTerminated:
+		return c.uvm.StartedTime()
+	default:
+		return time.Time{}
+	}
+}
+
+// ExitStatus returns the final status of the VM once it has reached
+// [StateTerminated], including the time it stopped and any exit error.
+// An error is returned while the VM has not yet stopped.
+func (c *Manager) ExitStatus() (*ExitStatus, error) {
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+
+	if c.vmState == StateTerminated {
+		return &ExitStatus{
+			StoppedTime: c.uvm.StoppedTime(),
+			Err:         c.uvm.ExitError(),
+		}, nil
+	}
+
+	return nil, fmt.Errorf("cannot get exit status: VM is in incorrect state %s", c.vmState)
+}
diff --git a/internal/controller/vm/vm_lcow.go b/internal/controller/vm/vm_lcow.go
new file mode 100644
index 0000000000..e8c5b51194
--- /dev/null
+++ b/internal/controller/vm/vm_lcow.go
@@ -0,0 +1,98 @@
+//go:build windows && !wcow
+
+package vm
+
+import (
+ "context"
+ "crypto/rand"
+ "fmt"
+ "io"
+
+ "github.com/Microsoft/hcsshim/internal/vm/vmmanager"
+ "github.com/Microsoft/hcsshim/internal/vm/vmutils"
+
+ "github.com/Microsoft/go-winio"
+ "golang.org/x/sync/errgroup"
+)
+
+// setupEntropyListener sets up entropy for LCOW UVMs.
+//
+// Linux VMs require entropy to initialize their random number generators during boot.
+// This method listens on a predefined vsock port and provides cryptographically secure
+// random data to the Linux init process when it connects.
+func (c *Manager) setupEntropyListener(ctx context.Context, group *errgroup.Group) {
+	group.Go(func() error {
+		// The Linux guest will connect to this port during init to receive entropy.
+		entropyConn, err := winio.ListenHvsock(&winio.HvsockAddr{
+			VMID:      c.uvm.RuntimeID(),
+			ServiceID: winio.VsockServiceID(vmutils.LinuxEntropyVsockPort),
+		})
+		if err != nil {
+			return fmt.Errorf("failed to listen on hvSocket for entropy: %w", err)
+		}
+
+		// Prepare to provide entropy to the init process in the background. This
+		// must be done in a goroutine since, when using the internal bridge, the
+		// call to Start() will block until the GCS launches, and this cannot occur
+		// until the host accepts and closes the entropy connection.
+		// NOTE(review): the listener itself is not closed here; this assumes
+		// AcceptConnection's final argument (true) transfers ownership of /
+		// closes the listener — confirm against vmmanager.AcceptConnection.
+		conn, err := vmmanager.AcceptConnection(ctx, c.uvm, entropyConn, true)
+		if err != nil {
+			return fmt.Errorf("failed to accept connection on hvSocket for entropy: %w", err)
+		}
+		defer conn.Close()
+
+		// Write the required amount of entropy to the connection.
+		// The init process will read this data and use it to seed the kernel's
+		// random number generator (CRNG).
+		_, err = io.CopyN(conn, rand.Reader, vmutils.LinuxEntropyBytes)
+		if err != nil {
+			return fmt.Errorf("failed to write entropy to connection: %w", err)
+		}
+
+		return nil
+	})
+}
+
+// setupLoggingListener sets up logging for LCOW UVMs.
+//
+// This method establishes a vsock connection to receive log output from GCS
+// running inside the Linux VM. The logs are parsed and
+// forwarded to the host's logging system for monitoring and debugging.
+//
+// c.logOutputDone is closed on every path (both error paths and when the
+// processing goroutine finishes) so that Manager.Wait never blocks forever.
+func (c *Manager) setupLoggingListener(ctx context.Context, group *errgroup.Group) {
+	group.Go(func() error {
+		// The GCS will connect to this port to stream log output.
+		logConn, err := winio.ListenHvsock(&winio.HvsockAddr{
+			VMID:      c.uvm.RuntimeID(),
+			ServiceID: winio.VsockServiceID(vmutils.LinuxLogVsockPort),
+		})
+		if err != nil {
+			close(c.logOutputDone)
+			return fmt.Errorf("failed to listen on hvSocket for logs: %w", err)
+		}
+
+		// Accept the connection from the GCS.
+		// NOTE(review): the listener itself is not closed here; this assumes
+		// AcceptConnection's final argument (true) transfers ownership of /
+		// closes the listener — confirm against vmmanager.AcceptConnection.
+		conn, err := vmmanager.AcceptConnection(ctx, c.uvm, logConn, true)
+		if err != nil {
+			close(c.logOutputDone)
+			return fmt.Errorf("failed to accept connection on hvSocket for logs: %w", err)
+		}
+
+		// Launch a separate goroutine to process logs for the lifetime of the VM.
+		go func() {
+			// Parse GCS log output and forward it to the host logging system.
+			vmutils.ParseGCSLogrus(c.uvm.ID())(conn)
+
+			// Signal that log output processing has completed.
+			// This allows Wait() to ensure all logs are processed before returning.
+			close(c.logOutputDone)
+		}()
+
+		return nil
+	})
+}
+
+// finalizeGCSConnection finalizes the GCS connection for LCOW VMs.
+// For LCOW, no additional finalization is needed; WCOW builds (build tag
+// "wcow") provide their own implementation.
+func (c *Manager) finalizeGCSConnection(_ context.Context) error {
+	return nil
+}
diff --git a/internal/controller/vm/vm_wcow.go b/internal/controller/vm/vm_wcow.go
new file mode 100644
index 0000000000..de6053be8e
--- /dev/null
+++ b/internal/controller/vm/vm_wcow.go
@@ -0,0 +1,116 @@
+//go:build windows && wcow
+
+package vm
+
+import (
+ "context"
+ "fmt"
+ "sync"
+
+ "github.com/Microsoft/go-winio"
+ "github.com/Microsoft/hcsshim/internal/gcs/prot"
+ hcsschema "github.com/Microsoft/hcsshim/internal/hcs/schema2"
+ "github.com/Microsoft/hcsshim/internal/vm/vmmanager"
+ "github.com/Microsoft/hcsshim/internal/vm/vmutils"
+
+ "github.com/sirupsen/logrus"
+ "golang.org/x/net/netutil"
+ "golang.org/x/sync/errgroup"
+)
+
+// setupEntropyListener sets up entropy for WCOW (Windows Containers on Windows) VMs.
+//
+// For WCOW, entropy setup is not required. Windows VMs have their own internal
+// random number generation that does not depend on host-provided entropy.
+// This is a no-op implementation to satisfy the platform-specific interface.
+//
+// For comparison, LCOW VMs require entropy to be provided during boot: the
+// lcow build of this method streams LinuxEntropyBytes of random data to the
+// guest so the init process can seed the kernel CRNG.
+func (c *Manager) setupEntropyListener(_ context.Context, _ *errgroup.Group) {}
+
+// setupLoggingListener sets up logging for WCOW UVMs.
+//
+// Unlike LCOW, where the log connection must be established before the VM starts,
+// WCOW allows the GCS to connect to the logging socket at any time after the VM
+// is running. This method sets up a persistent listener that can accept connections
+// even if the GCS restarts or reconnects.
+//
+// The listener is configured to accept only one concurrent connection at a time
+// to prevent resource exhaustion, but will accept new connections if the current one is closed.
+// This supports scenarios where the logging service inside the VM needs to restart.
+//
+// Lifetime: the goroutine below runs until AcceptConnection returns an error.
+// Nothing in this function closes baseListener — presumably it is torn down
+// elsewhere when the VM shuts down, which is what unblocks the accept loop
+// (NOTE(review): confirm against the Manager shutdown path). c.logOutputDone
+// is closed exactly once: either on listen failure, or after the accept loop
+// exits and all per-connection goroutines have drained.
+func (c *Manager) setupLoggingListener(ctx context.Context, _ *errgroup.Group) {
+	// For Windows, the listener can receive a connection later (after VM starts),
+	// so we start the output handler in a goroutine with a non-timeout context.
+	// This allows the output handler to run independently of the VM creation lifecycle.
+	// This is useful for the case when the logging service is restarted.
+	go func() {
+		baseListener, err := winio.ListenHvsock(&winio.HvsockAddr{
+			VMID:      c.uvm.RuntimeID(),
+			ServiceID: prot.WindowsLoggingHvsockServiceID,
+		})
+		if err != nil {
+			// Close the output done channel to signal that logging setup
+			// has failed and no logs will be processed.
+			close(c.logOutputDone)
+			logrus.WithError(err).Error("failed to listen for windows logging connections")
+
+			// Return early due to error.
+			return
+		}
+
+		// Use a WaitGroup to track active log processing goroutines.
+		// This ensures we wait for all log processing to complete before closing logOutputDone.
+		var wg sync.WaitGroup
+
+		// Limit the listener to accept at most 1 concurrent connection;
+		// LimitListener blocks further Accepts until the active connection closes.
+		limitedListener := netutil.LimitListener(baseListener, 1)
+
+		for {
+			// Accept a connection from the GCS. context.WithoutCancel keeps
+			// the accept loop alive beyond the (possibly deadline-bound)
+			// VM-creation context, per the comment at the top of the goroutine.
+			conn, err := vmmanager.AcceptConnection(context.WithoutCancel(ctx), c.uvm, limitedListener, false)
+			if err != nil {
+				logrus.WithError(err).Error("failed to connect to log socket")
+				break
+			}
+
+			// Launch a goroutine to process logs from this connection.
+			wg.Add(1)
+			go func() {
+				defer wg.Done()
+				logrus.Info("uvm output handler starting")
+
+				// Parse GCS log output and forward it to the host logging system.
+				// The parser handles logrus-formatted logs from the GCS.
+				vmutils.ParseGCSLogrus(c.uvm.ID())(conn)
+
+				logrus.Info("uvm output handler finished")
+			}()
+		}
+
+		// Wait for all log processing goroutines to complete.
+		wg.Wait()
+
+		// Signal that log output processing has completed.
+		close(c.logOutputDone)
+	}()
+}
+
+// finalizeGCSConnection finalizes the GCS connection for WCOW UVMs.
+// This is called after CreateConnection succeeds and before the VM is considered fully started.
+//
+// It pushes the HvSocket address pairing (guest VM runtime ID + predefined
+// host parent ID) to the guest manager so the GCS can establish proper
+// bidirectional communication.
+func (c *Manager) finalizeGCSConnection(ctx context.Context) error {
+	// Prepare the HvSocket address configuration for the external GCS connection.
+	// The LocalAddress is the VM's runtime ID, and the ParentAddress is the
+	// predefined host ID for Windows GCS communication.
+	hvsocketAddress := &hcsschema.HvSocketAddress{
+		LocalAddress:  c.uvm.RuntimeID().String(),
+		ParentAddress: prot.WindowsGcsHvHostID.String(),
+	}
+
+	// Update the guest manager with the HvSocket address configuration.
+	// The error names the operation that actually failed: by the time this
+	// runs the GCS connection itself has already been created, so the old
+	// "failed to create GCS connection" wording was misleading.
+	if err := c.guest.UpdateHvSocketAddress(ctx, hvsocketAddress); err != nil {
+		return fmt.Errorf("failed to update HvSocket address for GCS connection: %w", err)
+	}
+
+	return nil
+}
diff --git a/internal/logfields/fields.go b/internal/logfields/fields.go
index cceb3e2d18..dac5a708e5 100644
--- a/internal/logfields/fields.go
+++ b/internal/logfields/fields.go
@@ -8,12 +8,12 @@ const (
Operation = "operation"
ID = "id"
- SandboxID = "sid"
ContainerID = "cid"
ExecID = "eid"
ProcessID = "pid"
TaskID = "tid"
UVMID = "uvm-id"
+ SandboxID = "sandbox-id"
// networking and IO
@@ -50,6 +50,40 @@ const (
Uint32 = "uint32"
Uint64 = "uint64"
+ // task / process lifecycle
+
+ Bundle = "bundle"
+ Terminal = "terminal"
+ Stdin = "stdin"
+ Stdout = "stdout"
+ Stderr = "stderr"
+ Checkpoint = "checkpoint"
+ ParentCheckpoint = "parent-checkpoint"
+ Status = "status"
+ ExitStatus = "exit-status"
+ ExitedAt = "exited-at"
+ Signal = "signal"
+ All = "all"
+ Width = "width"
+ Height = "height"
+ Version = "version"
+ ShimPid = "shim-pid"
+ TaskPid = "task-pid"
+
+ // sandbox
+
+ NetNsPath = "net-ns-path"
+ Verbose = "verbose"
+
+ // shimdiag
+
+ Args = "args"
+ Workdir = "workdir"
+ HostPath = "host-path"
+ UVMPath = "uvm-path"
+ ReadOnly = "readonly"
+ Execs = "execs"
+
// runhcs
VMShimOperation = "vmshim-op"
diff --git a/internal/vm/vmutils/doc.go b/internal/vm/vmutils/doc.go
index e78e4a5809..31ffb541ca 100644
--- a/internal/vm/vmutils/doc.go
+++ b/internal/vm/vmutils/doc.go
@@ -7,6 +7,6 @@
// (internal/controller). Functions in this package are designed to be decoupled from
// specific UVM implementations.
//
-// This allows different shims (containerd-shim-runhcs-v1, containerd-shim-lcow-v1)
+// This allows different shims (containerd-shim-runhcs-v1, containerd-shim-lcow-v2)
// to share common logic while maintaining their own orchestration patterns.
package vmutils
diff --git a/internal/vm/vmutils/utils.go b/internal/vm/vmutils/utils.go
index cd710a6bc3..f609f975ff 100644
--- a/internal/vm/vmutils/utils.go
+++ b/internal/vm/vmutils/utils.go
@@ -9,7 +9,12 @@ import (
"os"
"path/filepath"
+ runhcsoptions "github.com/Microsoft/hcsshim/cmd/containerd-shim-runhcs-v1/options"
"github.com/Microsoft/hcsshim/internal/log"
+
+ "github.com/containerd/typeurl/v2"
+ "github.com/sirupsen/logrus"
+ "google.golang.org/protobuf/types/known/anypb"
)
// ParseUVMReferenceInfo reads the UVM reference info file, and base64 encodes the content if it exists.
@@ -30,3 +35,29 @@ func ParseUVMReferenceInfo(ctx context.Context, referenceRoot, referenceName str
return base64.StdEncoding.EncodeToString(content), nil
}
+
+// UnmarshalRuntimeOptions decodes the runtime options into runhcsoptions.Options.
+// When no options are provided (options == nil) it returns a non-nil,
+// zero-value Options struct.
+func UnmarshalRuntimeOptions(ctx context.Context, options *anypb.Any) (*runhcsoptions.Options, error) {
+	// A nil payload means "use defaults": hand back an empty Options value
+	// rather than nil so callers never need a nil check.
+	if options == nil {
+		return &runhcsoptions.Options{}, nil
+	}
+
+	decoded, err := typeurl.UnmarshalAny(options)
+	if err != nil {
+		return nil, fmt.Errorf("failed to unmarshal options: %w", err)
+	}
+
+	shimOpts, ok := decoded.(*runhcsoptions.Options)
+	if !ok {
+		return nil, fmt.Errorf("failed to unmarshal runtime options: expected *runhcsoptions.Options, got %T", decoded)
+	}
+
+	// Emit the parsed options for debugging; the level check up front avoids
+	// the cost of log.Format when debug logging is disabled.
+	if entry := log.G(ctx); entry.Logger.IsLevelEnabled(logrus.DebugLevel) {
+		entry.WithField("options", log.Format(ctx, shimOpts)).Debug("parsed runtime options")
+	}
+
+	return shimOpts, nil
+}