diff --git a/api/nvidia/v1/clusterpolicy_types.go b/api/nvidia/v1/clusterpolicy_types.go index 1b2037cc1..417b412ef 100644 --- a/api/nvidia/v1/clusterpolicy_types.go +++ b/api/nvidia/v1/clusterpolicy_types.go @@ -94,6 +94,8 @@ type ClusterPolicySpec struct { CCManager CCManagerSpec `json:"ccManager,omitempty"` // HostPaths defines various paths on the host needed by GPU Operator components HostPaths HostPathsSpec `json:"hostPaths,omitempty"` + // KataSandboxDevicePlugin component spec + KataSandboxDevicePlugin KataDevicePluginSpec `json:"kataSandboxDevicePlugin,omitempty"` } // Runtime defines container runtime type @@ -124,6 +126,16 @@ func (r Runtime) String() string { } } +// SandboxWorkloadsMode defines the mode for sandbox workloads +type SandboxWorkloadsMode string + +const ( + // KubeVirt is the SandboxWorkloadsMode value for enabling KubeVirt based workloads + KubeVirt SandboxWorkloadsMode = "kubevirt" + // Kata is the SandboxWorkloadsMode value for enabling Kata Container based workloads + Kata SandboxWorkloadsMode = "kata" +) + // OperatorSpec describes configuration options for the operator type OperatorSpec struct { // Deprecated: DefaultRuntime is no longer used by the gpu-operator. This is instead, detected at runtime. @@ -198,6 +210,11 @@ type SandboxWorkloadsSpec struct { // +kubebuilder:validation:Enum=container;vm-passthrough;vm-vgpu // +kubebuilder:default=container DefaultWorkload string `json:"defaultWorkload,omitempty"` + // Mode indicates the sandbox mode. Accepted values are "kubevirt" + // and "kata". The default value is "kubevirt". 
+ // +kubebuilder:validation:Enum=kubevirt;kata + // +kubebuilder:default=kubevirt + Mode string `json:"mode,omitempty"` } // PSPSpec describes configuration for PodSecurityPolicies to apply for all Pods @@ -1464,6 +1481,69 @@ type MIGGPUClientsConfigSpec struct { Name string `json:"name,omitempty"` } +// ImageSpec defines shared fields for component images +type ImageSpec struct { + // NVIDIA component image repository + // +kubebuilder:validation:Optional + Repository string `json:"repository,omitempty"` + + // NVIDIA component image name + // +kubebuilder:validation:Pattern=[a-zA-Z0-9\-]+ + Image string `json:"image,omitempty"` + + // NVIDIA component image tag + // +kubebuilder:validation:Optional + Version string `json:"version,omitempty"` + + // Image pull policy + // +kubebuilder:validation:Optional + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Image Pull Policy" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:imagePullPolicy" + ImagePullPolicy string `json:"imagePullPolicy,omitempty"` +} + +// ComponentCommonSpec defines shared fields for components +type ComponentCommonSpec struct { + // Enabled indicates if deployment of NVIDIA component through operator is enabled + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable NVIDIA component deployment through GPU Operator" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch" + Enabled *bool `json:"enabled,omitempty"` + + // Image pull secrets + // +kubebuilder:validation:Optional + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Image pull 
secrets" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:io.kubernetes:Secret" + ImagePullSecrets []string `json:"imagePullSecrets,omitempty"` + + // Optional: Define resources requests and limits for each pod + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Resource Requirements" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:advanced,urn:alm:descriptor:com.tectonic.ui:resourceRequirements" + Resources *ResourceRequirements `json:"resources,omitempty"` + + // Optional: List of arguments + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Arguments" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:advanced,urn:alm:descriptor:com.tectonic.ui:text" + Args []string `json:"args,omitempty"` + + // Optional: List of environment variables + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Environment Variables" + // +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:advanced,urn:alm:descriptor:com.tectonic.ui:text" + Env []EnvVar `json:"env,omitempty"` +} + +// KataDevicePluginSpec defines attributes for the kata device plugin. +// The Kata device plugin is deployed when SandboxWorkloads is enabled, SandboxWorkloads.Mode is "kata", and Enabled is true. 
+type KataDevicePluginSpec struct {
+	ImageSpec           `json:",inline"`
+	ComponentCommonSpec `json:",inline"`
+}
+
 // KataManagerSpec defines the configuration for the kata-manager which prepares NVIDIA-specific kata runtimes
 type KataManagerSpec struct {
 	// Enabled indicates if deployment of Kata Manager is enabled
@@ -1905,6 +1985,9 @@ func ImagePath(spec interface{}) (string, error) {
 	case *CCManagerSpec:
 		config := spec.(*CCManagerSpec)
 		return imagePath(config.Repository, config.Image, config.Version, "CC_MANAGER_IMAGE")
+	case *KataDevicePluginSpec:
+		config := spec.(*KataDevicePluginSpec)
+		return imagePath(config.Repository, config.Image, config.Version, "KATA_SANDBOX_DEVICE_PLUGIN_IMAGE")
 	default:
 		return "", fmt.Errorf("invalid type to construct image path: %v", v)
 	}
@@ -2083,6 +2166,12 @@ func (s *SandboxDevicePluginSpec) IsEnabled() bool {
 	return *s.Enabled
 }
 
+// IsEnabled reports whether the kata sandbox device plugin is enabled through gpu-operator.
+// An unset Enabled field is treated as disabled.
+func (k *KataDevicePluginSpec) IsEnabled() bool {
+	return k.Enabled != nil && *k.Enabled
+}
+
 // IsEnabled returns true if PodSecurityAdmission configuration is enabled for all gpu-operator pods
 func (p *PSASpec) IsEnabled() bool {
 	if p.Enabled == nil {
diff --git a/api/nvidia/v1/zz_generated.deepcopy.go b/api/nvidia/v1/zz_generated.deepcopy.go
index b56ae0612..f65e0648b 100644
--- a/api/nvidia/v1/zz_generated.deepcopy.go
+++ b/api/nvidia/v1/zz_generated.deepcopy.go
@@ -214,6 +214,7 @@ func (in *ClusterPolicySpec) DeepCopyInto(out *ClusterPolicySpec) {
 	in.KataManager.DeepCopyInto(&out.KataManager)
 	in.CCManager.DeepCopyInto(&out.CCManager)
 	out.HostPaths = in.HostPaths
+	in.KataSandboxDevicePlugin.DeepCopyInto(&out.KataSandboxDevicePlugin)
 }
 
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ClusterPolicySpec.
@@ -248,6 +249,46 @@ func (in *ClusterPolicyStatus) DeepCopy() *ClusterPolicyStatus { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ComponentCommonSpec) DeepCopyInto(out *ComponentCommonSpec) { + *out = *in + if in.Enabled != nil { + in, out := &in.Enabled, &out.Enabled + *out = new(bool) + **out = **in + } + if in.ImagePullSecrets != nil { + in, out := &in.ImagePullSecrets, &out.ImagePullSecrets + *out = make([]string, len(*in)) + copy(*out, *in) + } + if in.Resources != nil { + in, out := &in.Resources, &out.Resources + *out = new(ResourceRequirements) + (*in).DeepCopyInto(*out) + } + if in.Args != nil { + in, out := &in.Args, &out.Args + *out = make([]string, len(*in)) + copy(*out, *in) + } + if in.Env != nil { + in, out := &in.Env, &out.Env + *out = make([]EnvVar, len(*in)) + copy(*out, *in) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ComponentCommonSpec. +func (in *ComponentCommonSpec) DeepCopy() *ComponentCommonSpec { + if in == nil { + return nil + } + out := new(ComponentCommonSpec) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *ContainerProbeSpec) DeepCopyInto(out *ContainerProbeSpec) { *out = *in @@ -948,6 +989,21 @@ func (in *HostPathsSpec) DeepCopy() *HostPathsSpec { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ImageSpec) DeepCopyInto(out *ImageSpec) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ImageSpec. 
+func (in *ImageSpec) DeepCopy() *ImageSpec { + if in == nil { + return nil + } + out := new(ImageSpec) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *InitContainerSpec) DeepCopyInto(out *InitContainerSpec) { *out = *in @@ -968,6 +1024,23 @@ func (in *InitContainerSpec) DeepCopy() *InitContainerSpec { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *KataDevicePluginSpec) DeepCopyInto(out *KataDevicePluginSpec) { + *out = *in + out.ImageSpec = in.ImageSpec + in.ComponentCommonSpec.DeepCopyInto(&out.ComponentCommonSpec) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new KataDevicePluginSpec. +func (in *KataDevicePluginSpec) DeepCopy() *KataDevicePluginSpec { + if in == nil { + return nil + } + out := new(KataDevicePluginSpec) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *KataManagerSpec) DeepCopyInto(out *KataManagerSpec) { *out = *in diff --git a/assets/state-kata-device-plugin/0100_service_account.yaml b/assets/state-kata-device-plugin/0100_service_account.yaml new file mode 100644 index 000000000..d9ab8878a --- /dev/null +++ b/assets/state-kata-device-plugin/0100_service_account.yaml @@ -0,0 +1,5 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: nvidia-kata-sandbox-device-plugin + namespace: "FILLED BY THE OPERATOR" diff --git a/assets/state-kata-device-plugin/0200_role.yaml b/assets/state-kata-device-plugin/0200_role.yaml new file mode 100644 index 000000000..945529cae --- /dev/null +++ b/assets/state-kata-device-plugin/0200_role.yaml @@ -0,0 +1,34 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: nvidia-kata-sandbox-device-plugin + namespace: "FILLED BY THE OPERATOR" +rules: +- apiGroups: + - security.openshift.io + resources: + - securitycontextconstraints + verbs: + - use + resourceNames: + - privileged +- apiGroups: + - "" + resources: + - pods + verbs: + - create + - get + - list + - watch + - delete +- apiGroups: + - nfd.k8s-sigs.io + resources: + - nodefeatures + verbs: + - create + - get + - list + - watch + - update diff --git a/assets/state-kata-device-plugin/0210_clusterrole.yaml b/assets/state-kata-device-plugin/0210_clusterrole.yaml new file mode 100644 index 000000000..5307ee8e8 --- /dev/null +++ b/assets/state-kata-device-plugin/0210_clusterrole.yaml @@ -0,0 +1,19 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: nvidia-kata-sandbox-device-plugin +rules: +- apiGroups: + - config.openshift.io + resources: + - clusterversions + verbs: + - get + - list +- apiGroups: + - "" + resources: + - nodes + - pods + verbs: + - "get" diff --git a/assets/state-kata-device-plugin/0300_rolebinding.yaml b/assets/state-kata-device-plugin/0300_rolebinding.yaml new file mode 100644 index 000000000..d242cdaba --- /dev/null +++ 
b/assets/state-kata-device-plugin/0300_rolebinding.yaml @@ -0,0 +1,13 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: nvidia-kata-sandbox-device-plugin + namespace: "FILLED BY THE OPERATOR" +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: nvidia-kata-sandbox-device-plugin +subjects: +- kind: ServiceAccount + name: nvidia-kata-sandbox-device-plugin + namespace: "FILLED BY THE OPERATOR" diff --git a/assets/state-kata-device-plugin/0310_clusterrolebinding.yaml b/assets/state-kata-device-plugin/0310_clusterrolebinding.yaml new file mode 100644 index 000000000..5544333ba --- /dev/null +++ b/assets/state-kata-device-plugin/0310_clusterrolebinding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: nvidia-kata-sandbox-device-plugin +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: nvidia-kata-sandbox-device-plugin +subjects: +- kind: ServiceAccount + name: nvidia-kata-sandbox-device-plugin + namespace: "FILLED BY THE OPERATOR" diff --git a/assets/state-kata-device-plugin/0410_scc.openshift.yaml b/assets/state-kata-device-plugin/0410_scc.openshift.yaml new file mode 100644 index 000000000..52e390e10 --- /dev/null +++ b/assets/state-kata-device-plugin/0410_scc.openshift.yaml @@ -0,0 +1,37 @@ +# Please edit the object below. Lines beginning with a '#' will be ignored, +# and an empty file will abort the edit. If an error occurs while saving this file will be +# reopened with the relevant failures. 
+# +allowHostDirVolumePlugin: true +allowHostIPC: false +allowHostNetwork: false +allowHostPID: false +allowHostPorts: false +allowPrivilegeEscalation: true +allowPrivilegedContainer: true +apiVersion: security.openshift.io/v1 +defaultAddCapabilities: null +fsGroup: + type: RunAsAny +groups: +- system:cluster-admins +- system:nodes +- system:masters +kind: SecurityContextConstraints +metadata: + name: nvidia-kata-sandbox-device-plugin +priority: null +readOnlyRootFilesystem: false +requiredDropCapabilities: null +runAsUser: + type: RunAsAny +seLinuxContext: + type: RunAsAny +seccompProfiles: +- '*' +supplementalGroups: + type: RunAsAny +users: +- "FILLED BY THE OPERATOR" +volumes: +- '*' diff --git a/assets/state-kata-device-plugin/0500_daemonset.yaml b/assets/state-kata-device-plugin/0500_daemonset.yaml new file mode 100644 index 000000000..abf929591 --- /dev/null +++ b/assets/state-kata-device-plugin/0500_daemonset.yaml @@ -0,0 +1,82 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + labels: + app: nvidia-kata-sandbox-device-plugin-daemonset + name: nvidia-kata-sandbox-device-plugin-daemonset + namespace: "FILLED BY THE OPERATOR" + annotations: + openshift.io/scc: nvidia-kata-sandbox-device-plugin +spec: + selector: + matchLabels: + app: nvidia-kata-sandbox-device-plugin-daemonset + template: + metadata: + labels: + app: nvidia-kata-sandbox-device-plugin-daemonset + spec: + nodeSelector: + nvidia.com/gpu.deploy.kata-sandbox-device-plugin: "true" + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + priorityClassName: system-node-critical + serviceAccountName: nvidia-kata-sandbox-device-plugin + initContainers: + - name: vfio-pci-validation + image: "FILLED BY THE OPERATOR" + command: ['sh', '-c'] + args: + - until [ -f /run/nvidia/validations/workload-type ]; do echo waiting for workload type status file; sleep 5; done; + if [ "$( 0 { + addPullSecrets(&obj.Spec.Template.Spec, config.KataSandboxDevicePlugin.ImagePullSecrets) + } + if 
config.KataSandboxDevicePlugin.Resources != nil { + for i := range obj.Spec.Template.Spec.Containers { + obj.Spec.Template.Spec.Containers[i].Resources.Requests = config.KataSandboxDevicePlugin.Resources.Requests + obj.Spec.Template.Spec.Containers[i].Resources.Limits = config.KataSandboxDevicePlugin.Resources.Limits + } + } + if len(config.KataSandboxDevicePlugin.Args) > 0 { + obj.Spec.Template.Spec.Containers[0].Args = config.KataSandboxDevicePlugin.Args + } + if len(config.KataSandboxDevicePlugin.Env) > 0 { + for _, env := range config.KataSandboxDevicePlugin.Env { + setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), env.Name, env.Value) + } + } + return nil +} + // TransformDCGMExporter transforms dcgm exporter daemonset with required config as per ClusterPolicy func TransformDCGMExporter(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error { // update validation container diff --git a/controllers/object_controls_test.go b/controllers/object_controls_test.go index 65348b949..f6df7340d 100644 --- a/controllers/object_controls_test.go +++ b/controllers/object_controls_test.go @@ -57,6 +57,7 @@ const ( driverAssetsPath = "assets/state-driver/" vGPUManagerAssetsPath = "assets/state-vgpu-manager/" sandboxDevicePluginAssetsPath = "assets/state-sandbox-device-plugin" + kataDevicePluginAssetsPath = "assets/state-kata-device-plugin" devicePluginAssetsPath = "assets/state-device-plugin/" dcgmExporterAssetsPath = "assets/state-dcgm-exporter/" migManagerAssetsPath = "assets/state-mig-manager/" @@ -399,6 +400,24 @@ func testDaemonsetCommon(t *testing.T, cp *gpuv1.ClusterPolicy, component string if err != nil { return nil, fmt.Errorf("unable to get mainCtrImage for sandbox-device-plugin: %v", err) } + case "KataDevicePlugin": + spec = commonDaemonsetSpec{ + repository: cp.Spec.KataSandboxDevicePlugin.Repository, + image: cp.Spec.KataSandboxDevicePlugin.Image, + version: cp.Spec.KataSandboxDevicePlugin.Version, + imagePullPolicy: 
cp.Spec.KataSandboxDevicePlugin.ImagePullPolicy,
+			imagePullSecrets: getImagePullSecrets(cp.Spec.KataSandboxDevicePlugin.ImagePullSecrets),
+			args:             cp.Spec.KataSandboxDevicePlugin.Args,
+			env:              cp.Spec.KataSandboxDevicePlugin.Env,
+			resources:        cp.Spec.KataSandboxDevicePlugin.Resources,
+		}
+		dsLabel = "nvidia-kata-sandbox-device-plugin-daemonset"
+		mainCtrName = "nvidia-kata-sandbox-device-plugin-ctr"
+		manifestFile = filepath.Join(cfg.root, kataDevicePluginAssetsPath)
+		mainCtrImage, err = gpuv1.ImagePath(&cp.Spec.KataSandboxDevicePlugin)
+		if err != nil {
+			return nil, fmt.Errorf("unable to get mainCtrImage for kata-device-plugin: %v", err)
+		}
 	case "DCGMExporter":
 		spec = commonDaemonsetSpec{
 			repository:       cp.Spec.DCGMExporter.Repository,
@@ -901,6 +920,7 @@ func getSandboxDevicePluginTestInput(testCase string) *gpuv1.ClusterPolicy {
 
 	// Until we create sample ClusterPolicies that have all fields
 	// set, hardcode some default values:
+	cp.Spec.SandboxWorkloads.Mode = "kubevirt"
 	cp.Spec.SandboxDevicePlugin.Repository = "nvcr.io/nvidia"
 	cp.Spec.SandboxDevicePlugin.Image = "kubevirt-device-plugin"
 	cp.Spec.SandboxDevicePlugin.Version = "v1.1.0"
@@ -999,6 +1019,115 @@ func TestSandboxDevicePluginAssets(t *testing.T) {
 	}
 }
 
+// getKataDevicePluginTestInput returns a ClusterPolicy instance for a particular
+// kata device plugin test case, or nil when the test case is not recognized.
+// Kata device plugin is implied when sandboxWorkloads.mode is "kata".
+// As a side effect it sets clusterPolicyController.sandboxEnabled to true.
+func getKataDevicePluginTestInput(testCase string) *gpuv1.ClusterPolicy {
+	cp := clusterPolicy.DeepCopy()
+
+	cp.Spec.KataSandboxDevicePlugin.Repository = "nvcr.io/nvidia"
+	cp.Spec.KataSandboxDevicePlugin.Image = "kata-gpu-device-plugin"
+	cp.Spec.KataSandboxDevicePlugin.Version = "v0.0.1"
+	clusterPolicyController.sandboxEnabled = true
+	cp.Spec.SandboxWorkloads.Enabled = boolTrue
+	cp.Spec.SandboxWorkloads.Mode = "kata"
+	cp.Spec.KataSandboxDevicePlugin.Enabled = boolTrue
+	cp.Spec.KataSandboxDevicePlugin.ImagePullSecrets = []string{"ngc-secret"}
+
+	cp.Spec.Validator.Repository = "nvcr.io/nvidia/cloud-native"
+	cp.Spec.Validator.Image = "gpu-operator-validator"
+	cp.Spec.Validator.Version = "v1.11.0"
+	cp.Spec.Validator.ImagePullSecrets = []string{"ngc-secret"}
+
+	switch testCase {
+	case "default":
+		// Do nothing
+	default:
+		return nil
+	}
+
+	return cp
+}
+
+// getKataDevicePluginTestOutput returns a map containing expected output for
+// kata device plugin test case, or nil when the test case is not recognized.
+func getKataDevicePluginTestOutput(testCase string) map[string]interface{} {
+	output := map[string]interface{}{
+		"numDaemonsets":   1,
+		"image":           "nvcr.io/nvidia/kata-gpu-device-plugin:v0.0.1",
+		"imagePullSecret": "ngc-secret",
+	}
+
+	switch testCase {
+	case "default":
+		// Do nothing
+	default:
+		return nil
+	}
+
+	return output
+}
+
+// TestKataDevicePlugin tests that the GPU Operator correctly deploys the kata-device-plugin
+// daemonset when sandboxWorkloads.mode is "kata".
+func TestKataDevicePlugin(t *testing.T) {
+	testCases := []struct {
+		description   string
+		clusterPolicy *gpuv1.ClusterPolicy
+		output        map[string]interface{}
+	}{
+		{
+			"Default",
+			getKataDevicePluginTestInput("default"),
+			getKataDevicePluginTestOutput("default"),
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.description, func(t *testing.T) {
+			ds, err := testDaemonsetCommon(t, tc.clusterPolicy, "KataDevicePlugin", tc.output["numDaemonsets"].(int))
+			if err != nil {
+				t.Fatalf("error in testDaemonsetCommon(): %v", err)
+			}
+			if ds == nil {
+				return
+			}
+
+			image := ""
+			for _, container := range ds.Spec.Template.Spec.Containers {
+				if strings.Contains(container.Name, "nvidia-kata-sandbox-device-plugin-ctr") {
+					image = container.Image
+					break // stop at the first matching container
+				}
+			}
+
+			require.Equal(t, tc.output["image"], image, "Unexpected configuration for nvidia-kata-sandbox-device-plugin-ctr image")
+
+			// cleanup by deleting all kubernetes objects
+			err = removeState(&clusterPolicyController, clusterPolicyController.idx-1)
+			if err != nil {
+				t.Fatalf("error removing state: %v", err)
+			}
+			clusterPolicyController.idx--
+		})
+	}
+}
+
+// TestKataDevicePluginAssets checks that the kata device plugin manifests can be added as a
+// state and that stepping the controller over them returns no error.
+func TestKataDevicePluginAssets(t *testing.T) {
+	manifestPath := filepath.Join(cfg.root, kataDevicePluginAssetsPath)
+	// add manifests
+	addState(&clusterPolicyController, manifestPath)
+
+	// create resources
+	_, err := clusterPolicyController.step()
+	if err != nil {
+		t.Errorf("error creating resources: %v", err)
+	}
+}
+
 // getDCGMExporterTestInput return a ClusterPolicy instance for a particular
 // dcgm-exporter test case.
func getDCGMExporterTestInput(testCase string) *gpuv1.ClusterPolicy { diff --git a/controllers/state_manager.go b/controllers/state_manager.go index 828c33640..c6b961886 100644 --- a/controllers/state_manager.go +++ b/controllers/state_manager.go @@ -66,16 +66,18 @@ const ( precompiledIdentificationLabelValue = "true" // see bundle/manifests/gpu-operator.clusterserviceversion.yaml // --> ClusterServiceVersion.metadata.annotations.operatorframework.io/suggested-namespace - ocpSuggestedNamespace = "nvidia-gpu-operator" - gpuWorkloadConfigLabelKey = "nvidia.com/gpu.workload.config" - gpuWorkloadConfigContainer = "container" - gpuWorkloadConfigVMPassthrough = "vm-passthrough" - gpuWorkloadConfigVMVgpu = "vm-vgpu" - podSecurityLabelPrefix = "pod-security.kubernetes.io/" - podSecurityLevelPrivileged = "privileged" - driverAutoUpgradeAnnotationKey = "nvidia.com/gpu-driver-upgrade-enabled" - commonDriverDaemonsetName = "nvidia-driver-daemonset" - commonVGPUManagerDaemonsetName = "nvidia-vgpu-manager-daemonset" + ocpSuggestedNamespace = "nvidia-gpu-operator" + gpuWorkloadConfigLabelKey = "nvidia.com/gpu.workload.config" + gpuWorkloadConfigContainer = "container" + gpuWorkloadConfigVMPassthrough = "vm-passthrough" + gpuWorkloadConfigVMVgpu = "vm-vgpu" + kubevirtDevicePluginDeployLabelKey = "nvidia.com/gpu.deploy.sandbox-device-plugin" + kataDevicePluginDeployLabelKey = "nvidia.com/gpu.deploy.kata-sandbox-device-plugin" + podSecurityLabelPrefix = "pod-security.kubernetes.io/" + podSecurityLevelPrivileged = "privileged" + driverAutoUpgradeAnnotationKey = "nvidia.com/gpu-driver-upgrade-enabled" + commonDriverDaemonsetName = "nvidia-driver-daemonset" + commonVGPUManagerDaemonsetName = "nvidia-vgpu-manager-daemonset" ) var ( @@ -117,9 +119,10 @@ var gpuNodeLabels = map[string]string{ } type gpuWorkloadConfiguration struct { - config string - node string - log logr.Logger + config string + sandboxMode string // SandboxWorkloads.Mode (e.g. 
"kubevirt", "kata") — only affects vm-passthrough labels + node string + log logr.Logger } // OpenShiftDriverToolkit contains the values required to deploy @@ -340,6 +343,30 @@ func getWorkloadConfig(labels map[string]string, sandboxEnabled bool) (string, e return defaultGPUWorkloadConfig, fmt.Errorf("no GPU workload config found") } +// getEffectiveStateLabels returns the state labels to apply for the given workload config and sandbox mode. +// When config is vm-passthrough and mode is "kata", returns labels with kata-device-plugin instead of sandbox-device-plugin. +func getEffectiveStateLabels(config, mode string) map[string]string { + labels, ok := gpuStateLabels[config] + if !ok { + return nil + } + + if config != gpuWorkloadConfigVMPassthrough { + return labels + } + + // update labels for the sandbox modes for passthrough + switch gpuv1.SandboxWorkloadsMode(mode) { + case gpuv1.Kata: + delete(labels, kubevirtDevicePluginDeployLabelKey) + labels[kataDevicePluginDeployLabelKey] = "true" + case gpuv1.KubeVirt: + delete(labels, kataDevicePluginDeployLabelKey) + labels[kubevirtDevicePluginDeployLabelKey] = "true" + } + return labels +} + // removeAllGPUStateLabels removes all gpuStateLabels from the provided map of node labels. // removeAllGPUStateLabels returns true if the labels map has been modified. func removeAllGPUStateLabels(labels map[string]string) bool { @@ -352,6 +379,10 @@ func removeAllGPUStateLabels(labels map[string]string) bool { } } } + if _, ok := labels[kataDevicePluginDeployLabelKey]; ok { + delete(labels, kataDevicePluginDeployLabelKey) + modified = true + } if _, ok := labels[migManagerLabelKey]; ok { delete(labels, migManagerLabelKey) modified = true @@ -375,9 +406,11 @@ func (w *gpuWorkloadConfiguration) updateGPUStateLabels(labels map[string]string // addGPUStateLabels adds GPU state labels needed for the GPU workload configuration. // If a required state label already exists on the node, honor the current value. 
+// For vm-passthrough, uses kata-device-plugin when mode is "kata", otherwise sandbox-device-plugin. func (w *gpuWorkloadConfiguration) addGPUStateLabels(labels map[string]string) bool { modified := false - for key, value := range gpuStateLabels[w.config] { + effective := getEffectiveStateLabels(w.config, w.sandboxMode) + for key, value := range effective { if _, ok := labels[key]; !ok { w.log.Info("Setting node label", "NodeName", w.node, "Label", key, "Value", value) labels[key] = value @@ -392,23 +425,27 @@ func (w *gpuWorkloadConfiguration) addGPUStateLabels(labels map[string]string) b return modified } -// removeGPUStateLabels removes GPU state labels not needed for the GPU workload configuration +// removeGPUStateLabels removes GPU state labels not needed for the GPU workload configuration. +// Uses effective labels for (config, mode) so vm-passthrough+kata keeps kata-device-plugin, not sandbox-device-plugin. func (w *gpuWorkloadConfiguration) removeGPUStateLabels(labels map[string]string) bool { modified := false - for workloadConfig, labelsMap := range gpuStateLabels { - if workloadConfig == w.config { + effective := getEffectiveStateLabels(w.config, w.sandboxMode) + // Collect all keys that are ever used as state labels (from static map + mode-dependent key) + allStateKeys := make(map[string]bool) + for _, labelsMap := range gpuStateLabels { + for key := range labelsMap { + allStateKeys[key] = true + } + } + allStateKeys[kataDevicePluginDeployLabelKey] = true + for key := range labels { + if !allStateKeys[key] { continue } - for key := range labelsMap { - if _, ok := gpuStateLabels[w.config][key]; ok { - // skip label if it is in the set of states for workloadConfig - continue - } - if _, ok := labels[key]; ok { - w.log.Info("Deleting node label", "NodeName", w.node, "Label", key) - delete(labels, key) - modified = true - } + if _, keep := effective[key]; !keep { + w.log.Info("Deleting node label", "NodeName", w.node, "Label", key) + delete(labels, key) + 
modified = true } } if w.config != gpuWorkloadConfigContainer { @@ -507,7 +544,8 @@ func (n *ClusterPolicyController) labelGPUNodes() (bool, int, error) { "Error", err, "defaultGPUWorkloadConfig", defaultGPUWorkloadConfig) } n.logger.Info("GPU workload configuration", "NodeName", node.Name, "GpuWorkloadConfig", config) - gpuWorkloadConfig := &gpuWorkloadConfiguration{config, node.Name, n.logger} + mode := n.singleton.Spec.SandboxWorkloads.Mode + gpuWorkloadConfig := &gpuWorkloadConfiguration{config: config, sandboxMode: mode, node: node.Name, log: n.logger} if !hasCommonGPULabel(labels) && hasGPULabels(labels) { n.logger.Info("Node has GPU(s)", "NodeName", node.Name) // label the node with common Nvidia GPU label @@ -844,6 +882,7 @@ func (n *ClusterPolicyController) init(ctx context.Context, reconciler *ClusterP addState(n, "/opt/gpu-operator/state-sandbox-validation") addState(n, "/opt/gpu-operator/state-vfio-manager") addState(n, "/opt/gpu-operator/state-sandbox-device-plugin") + addState(n, "/opt/gpu-operator/state-kata-device-plugin") addState(n, "/opt/gpu-operator/state-kata-manager") addState(n, "/opt/gpu-operator/state-cc-manager") } @@ -1057,7 +1096,9 @@ func (n ClusterPolicyController) isStateEnabled(stateName string) bool { case "state-node-status-exporter": return clusterPolicySpec.NodeStatusExporter.IsEnabled() case "state-sandbox-device-plugin": - return n.sandboxEnabled && clusterPolicySpec.SandboxDevicePlugin.IsEnabled() + return n.sandboxEnabled && clusterPolicySpec.SandboxDevicePlugin.IsEnabled() && clusterPolicySpec.SandboxWorkloads.Mode == "kubevirt" + case "state-kata-device-plugin": + return n.sandboxEnabled && clusterPolicySpec.KataSandboxDevicePlugin.IsEnabled() && clusterPolicySpec.SandboxWorkloads.Mode == "kata" case "state-kata-manager": return n.sandboxEnabled && clusterPolicySpec.KataManager.IsEnabled() case "state-vfio-manager": diff --git a/controllers/state_manager_test.go b/controllers/state_manager_test.go index 26e3099d4..35c245be0 
100644
--- a/controllers/state_manager_test.go
+++ b/controllers/state_manager_test.go
@@ -229,3 +229,147 @@ func TestValidateClusterPolicySpec(t *testing.T) {
 		})
 	}
 }
+
+// TestGetEffectiveStateLabels checks the labels returned by getEffectiveStateLabels for each
+// workload config and sandbox mode. For container and vm-vgpu, mode has no effect. For
+// vm-passthrough, mode selects sandbox-device-plugin (kubevirt) vs kata-device-plugin (kata).
+func TestGetEffectiveStateLabels(t *testing.T) {
+	t.Run("container", func(t *testing.T) {
+		got := getEffectiveStateLabels(gpuWorkloadConfigContainer, "kubevirt")
+		require.NotNil(t, got)
+		require.Contains(t, got, "nvidia.com/gpu.deploy.device-plugin")
+		require.Equal(t, "true", got["nvidia.com/gpu.deploy.device-plugin"])
+	})
+	t.Run("vm-vgpu", func(t *testing.T) {
+		got := getEffectiveStateLabels(gpuWorkloadConfigVMVgpu, "kata")
+		require.NotNil(t, got)
+		require.Contains(t, got, "nvidia.com/gpu.deploy.sandbox-device-plugin")
+		require.Equal(t, "true", got["nvidia.com/gpu.deploy.sandbox-device-plugin"])
+	})
+	// vm-passthrough: test kubevirt first (map has sandbox-device-plugin), then kata.
+	t.Run("vm-passthrough-kubevirt", func(t *testing.T) {
+		got := getEffectiveStateLabels(gpuWorkloadConfigVMPassthrough, string(gpuv1.KubeVirt))
+		require.NotNil(t, got)
+		require.Contains(t, got, kubevirtDevicePluginDeployLabelKey)
+		require.Equal(t, "true", got[kubevirtDevicePluginDeployLabelKey])
+		require.NotContains(t, got, kataDevicePluginDeployLabelKey)
+	})
+	t.Run("vm-passthrough-kata", func(t *testing.T) {
+		got := getEffectiveStateLabels(gpuWorkloadConfigVMPassthrough, string(gpuv1.Kata))
+		require.NotNil(t, got)
+		require.Contains(t, got, kataDevicePluginDeployLabelKey)
+		require.Equal(t, "true", got[kataDevicePluginDeployLabelKey])
+		require.NotContains(t, got, kubevirtDevicePluginDeployLabelKey)
+	})
+	t.Run("invalid config", func(t *testing.T) {
+		got := getEffectiveStateLabels("invalid", "kubevirt")
+		require.Nil(t, got)
+	})
+}
+
+// TestRemoveAllGPUStateLabels checks that removeAllGPUStateLabels deletes the kata and sandbox
+// device plugin deploy labels, reports the modification, and leaves unrelated labels alone.
+func TestRemoveAllGPUStateLabels(t *testing.T) {
+	t.Run("removes kata device plugin label", func(t *testing.T) {
+		labels := map[string]string{
+			kataDevicePluginDeployLabelKey: "true",
+			"other":                        "keep",
+		}
+		modified := removeAllGPUStateLabels(labels)
+		require.True(t, modified)
+		require.NotContains(t, labels, kataDevicePluginDeployLabelKey)
+		require.Equal(t, "keep", labels["other"])
+	})
+	t.Run("removes sandbox deploy label", func(t *testing.T) {
+		labels := map[string]string{
+			kubevirtDevicePluginDeployLabelKey: "true",
+		}
+		modified := removeAllGPUStateLabels(labels)
+		require.True(t, modified)
+		// NotContains proves the key is gone; an empty value would not count as removed.
+		require.NotContains(t, labels, kubevirtDevicePluginDeployLabelKey)
+	})
+}
+
+// TestIsStateEnabled_SandboxAndKataDevicePlugin checks that isStateEnabled gates the sandbox and
+// kata device plugin states on sandboxEnabled, the component's Enabled flag and SandboxWorkloads.Mode.
+func TestIsStateEnabled_SandboxAndKataDevicePlugin(t *testing.T) {
+	boolTrue := ptr.To(true)
+	boolFalse := ptr.To(false)
+	tests := []struct {
+		name           string
+		sandboxEnabled bool
+		spec           gpuv1.ClusterPolicySpec
+		stateName      string
+		wantEnabled    bool
+	}{
+		{
+			name:           "state-sandbox-device-plugin enabled when sandbox+plugin+mode kubevirt",
+			sandboxEnabled: true,
+			spec: gpuv1.ClusterPolicySpec{
+				SandboxWorkloads:    gpuv1.SandboxWorkloadsSpec{Enabled: boolTrue, Mode: "kubevirt"},
+				SandboxDevicePlugin: gpuv1.SandboxDevicePluginSpec{Enabled: boolTrue},
+			},
+			stateName:   "state-sandbox-device-plugin",
+			wantEnabled: true,
+		},
+		{
+			name:           "state-sandbox-device-plugin disabled when mode kata",
+			sandboxEnabled: true,
+			spec: gpuv1.ClusterPolicySpec{
+				SandboxWorkloads:    gpuv1.SandboxWorkloadsSpec{Enabled: boolTrue, Mode: "kata"},
+				SandboxDevicePlugin: gpuv1.SandboxDevicePluginSpec{Enabled: boolTrue},
+			},
+			stateName:   "state-sandbox-device-plugin",
+			wantEnabled: false,
+		},
+		{
+			name:           "state-kata-device-plugin enabled when sandbox+kata plugin+mode kata",
+			sandboxEnabled: true,
+			spec: gpuv1.ClusterPolicySpec{
+				SandboxWorkloads:        gpuv1.SandboxWorkloadsSpec{Enabled: boolTrue, Mode: "kata"},
+				KataSandboxDevicePlugin: gpuv1.KataDevicePluginSpec{ComponentCommonSpec: gpuv1.ComponentCommonSpec{Enabled: boolTrue}},
+			},
+			stateName:   "state-kata-device-plugin",
+			wantEnabled: true,
+		},
+		{
+			name:           "state-kata-device-plugin disabled when mode kubevirt",
+			sandboxEnabled: true,
+			spec: gpuv1.ClusterPolicySpec{
+				SandboxWorkloads:        gpuv1.SandboxWorkloadsSpec{Enabled: boolTrue, Mode: "kubevirt"},
+				KataSandboxDevicePlugin: gpuv1.KataDevicePluginSpec{ComponentCommonSpec: gpuv1.ComponentCommonSpec{Enabled: boolTrue}},
+			},
+			stateName:   "state-kata-device-plugin",
+			wantEnabled: false,
+		},
+		{
+			name:           "state-kata-device-plugin disabled when KataSandboxDevicePlugin.Enabled false",
+			sandboxEnabled: true,
+			spec: gpuv1.ClusterPolicySpec{
+				SandboxWorkloads:        gpuv1.SandboxWorkloadsSpec{Enabled: boolTrue, Mode: "kata"},
+				KataSandboxDevicePlugin: gpuv1.KataDevicePluginSpec{ComponentCommonSpec: gpuv1.ComponentCommonSpec{Enabled: boolFalse}},
+			},
+			stateName:   "state-kata-device-plugin",
+			wantEnabled: false,
+		},
+		{
+			name:           "state-kata-device-plugin disabled when sandbox workloads disabled",
+			sandboxEnabled: false,
+			spec: gpuv1.ClusterPolicySpec{
+				SandboxWorkloads:        gpuv1.SandboxWorkloadsSpec{Enabled: boolTrue, Mode: "kata"},
+				KataSandboxDevicePlugin: gpuv1.KataDevicePluginSpec{ComponentCommonSpec: gpuv1.ComponentCommonSpec{Enabled: boolTrue}},
+			},
+			stateName:   "state-kata-device-plugin",
+			wantEnabled: false,
+		},
+	}
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			n := ClusterPolicyController{
+				singleton:      &gpuv1.ClusterPolicy{Spec: tc.spec},
+				sandboxEnabled: tc.sandboxEnabled,
+			}
+			got := n.isStateEnabled(tc.stateName)
+			require.Equal(t, tc.wantEnabled, got, "isStateEnabled(%q)", tc.stateName)
+		})
+	}
+}
diff --git a/controllers/transforms_test.go b/controllers/transforms_test.go
index ad93c5b6f..86ade02a2 100644
--- a/controllers/transforms_test.go
+++ b/controllers/transforms_test.go
@@ -2778,6 +2778,84 @@ func TestTransformSandboxValidator(t *testing.T) {
 	}
 }
 
+func TestTransformKataDevicePlugin(t *testing.T) {
+	resources := corev1.ResourceRequirements{
+		Limits: 
corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("100m"), + corev1.ResourceMemory: resource.MustParse("128Mi"), + }, + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("50m"), + corev1.ResourceMemory: resource.MustParse("64Mi"), + }, + } + testCases := []struct { + description string + ds Daemonset + cpSpec *gpuv1.ClusterPolicySpec + expectedDs Daemonset + }{ + { + description: "transform kata device plugin", + ds: NewDaemonset(). + WithInitContainer(corev1.Container{Name: "vfio-pci-validation"}). + WithContainer(corev1.Container{Name: "nvidia-kata-sandbox-device-plugin-ctr"}), + cpSpec: &gpuv1.ClusterPolicySpec{ + Validator: gpuv1.ValidatorSpec{ + Repository: "nvcr.io/nvidia/cloud-native", + Image: "gpu-operator-validator", + Version: "v1.0.0", + ImagePullPolicy: "IfNotPresent", + ImagePullSecrets: []string{"pull-secret"}, + }, + KataSandboxDevicePlugin: gpuv1.KataDevicePluginSpec{ + ImageSpec: gpuv1.ImageSpec{ + Repository: "nvcr.io/nvidia/cloud-native", + Image: "kata-sandbox-device-plugin", + Version: "v1.0.0", + ImagePullPolicy: "IfNotPresent", + }, + ComponentCommonSpec: gpuv1.ComponentCommonSpec{ + ImagePullSecrets: []string{"pull-secret"}, + Resources: &gpuv1.ResourceRequirements{Limits: resources.Limits, Requests: resources.Requests}, + Args: []string{"--test-flag"}, + Env: []gpuv1.EnvVar{{Name: "foo", Value: "bar"}}, + }, + }, + }, + expectedDs: NewDaemonset(). + WithInitContainer(corev1.Container{ + Name: "vfio-pci-validation", + Image: "nvcr.io/nvidia/cloud-native/gpu-operator-validator:v1.0.0", + ImagePullPolicy: corev1.PullIfNotPresent, + SecurityContext: &corev1.SecurityContext{ + RunAsUser: rootUID, + }, + }). 
+ WithContainer(corev1.Container{ + Name: "nvidia-kata-sandbox-device-plugin-ctr", + Image: "nvcr.io/nvidia/cloud-native/kata-sandbox-device-plugin:v1.0.0", + ImagePullPolicy: corev1.PullIfNotPresent, + Args: []string{"--test-flag"}, + Env: []corev1.EnvVar{{Name: "foo", Value: "bar"}}, + Resources: resources, + }). + WithPullSecret("pull-secret"), + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + err := TransformKataDevicePlugin(tc.ds.DaemonSet, tc.cpSpec, ClusterPolicyController{ + runtime: gpuv1.Containerd, + logger: ctrl.Log.WithName("test"), + }) + require.NoError(t, err) + require.EqualValues(t, tc.expectedDs, tc.ds) + }) + } +} + func TestTransformNodeStatusExporter(t *testing.T) { testCases := []struct { description string diff --git a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml index a3c1358d7..782d5aad9 100644 --- a/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml +++ b/deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml @@ -1610,6 +1610,82 @@ spec: description: Kata Manager image tag type: string type: object + kataSandboxDevicePlugin: + description: KataSandboxDevicePlugin component spec + properties: + args: + description: 'Optional: List of arguments' + items: + type: string + type: array + enabled: + description: Enabled indicates if deployment of NVIDIA component + through operator is enabled + type: boolean + env: + description: 'Optional: List of environment variables' + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. + type: string + value: + description: Value of the environment variable. 
+ type: string + required: + - name + type: object + type: array + image: + description: NVIDIA component image name + pattern: '[a-zA-Z0-9\-]+' + type: string + imagePullPolicy: + description: Image pull policy + type: string + imagePullSecrets: + description: Image pull secrets + items: + type: string + type: array + repository: + description: NVIDIA component image repository + type: string + resources: + description: 'Optional: Define resources requests and limits for + each pod' + properties: + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + version: + description: NVIDIA component image tag + type: string + type: object mig: description: MIG spec properties: @@ -1974,6 +2050,15 @@ spec: Enabled indicates if the GPU Operator should manage additional operands required for sandbox workloads (i.e. VFIO Manager, vGPU Manager, and additional device plugins) type: boolean + mode: + default: kubevirt + description: |- + Mode indicates the sandbox mode. 
Accepted values are "kubevirt" + and "kata". The default value is "kubevirt". + enum: + - kubevirt + - kata + type: string type: object toolkit: description: Toolkit component spec diff --git a/deployments/gpu-operator/templates/clusterpolicy.yaml b/deployments/gpu-operator/templates/clusterpolicy.yaml index 543089e17..608333946 100644 --- a/deployments/gpu-operator/templates/clusterpolicy.yaml +++ b/deployments/gpu-operator/templates/clusterpolicy.yaml @@ -688,6 +688,9 @@ spec: {{- if .Values.sandboxWorkloads.defaultWorkload }} defaultWorkload: {{ .Values.sandboxWorkloads.defaultWorkload }} {{- end }} + {{- if .Values.sandboxWorkloads.mode }} + mode: {{ .Values.sandboxWorkloads.mode | quote }} + {{- end }} sandboxDevicePlugin: {{- if .Values.sandboxDevicePlugin.enabled }} enabled: {{ .Values.sandboxDevicePlugin.enabled }} @@ -716,3 +719,31 @@ spec: {{- if .Values.sandboxDevicePlugin.args }} args: {{ toYaml .Values.sandboxDevicePlugin.args | nindent 6 }} {{- end }} + kataSandboxDevicePlugin: + {{- if ne .Values.kataSandboxDevicePlugin.enabled nil }} + enabled: {{ .Values.kataSandboxDevicePlugin.enabled }} + {{- end }} + {{- if .Values.kataSandboxDevicePlugin.repository }} + repository: {{ .Values.kataSandboxDevicePlugin.repository }} + {{- end }} + {{- if .Values.kataSandboxDevicePlugin.image }} + image: {{ .Values.kataSandboxDevicePlugin.image }} + {{- end }} + {{- if .Values.kataSandboxDevicePlugin.version }} + version: {{ .Values.kataSandboxDevicePlugin.version | quote }} + {{- end }} + {{- if .Values.kataSandboxDevicePlugin.imagePullPolicy }} + imagePullPolicy: {{ .Values.kataSandboxDevicePlugin.imagePullPolicy }} + {{- end }} + {{- if .Values.kataSandboxDevicePlugin.imagePullSecrets }} + imagePullSecrets: {{ toYaml .Values.kataSandboxDevicePlugin.imagePullSecrets | nindent 6 }} + {{- end }} + {{- if .Values.kataSandboxDevicePlugin.resources }} + resources: {{ toYaml .Values.kataSandboxDevicePlugin.resources | nindent 6 }} + {{- end }} + {{- if 
.Values.kataSandboxDevicePlugin.env }} + env: {{ toYaml .Values.kataSandboxDevicePlugin.env | nindent 6 }} + {{- end }} + {{- if .Values.kataSandboxDevicePlugin.args }} + args: {{ toYaml .Values.kataSandboxDevicePlugin.args | nindent 6 }} + {{- end }} diff --git a/deployments/gpu-operator/values.yaml b/deployments/gpu-operator/values.yaml index fbdbb43e7..383005949 100644 --- a/deployments/gpu-operator/values.yaml +++ b/deployments/gpu-operator/values.yaml @@ -19,6 +19,8 @@ cdi: sandboxWorkloads: enabled: false defaultWorkload: "container" + # Sandbox mode: "kubevirt" (default) or "kata". When "kata", the Kata device plugin is deployed on vm-passthrough nodes. + mode: "kubevirt" hostPaths: # rootFS represents the path to the root filesystem of the host. @@ -508,6 +510,18 @@ sandboxDevicePlugin: env: [] resources: {} +# Kata sandbox device plugin (used when sandboxWorkloads.mode is "kata"). +kataSandboxDevicePlugin: + enabled: true + repository: ghcr.io/nvidia + image: nvidia-sandbox-device-plugin + version: "3a8894b2" + imagePullPolicy: IfNotPresent + imagePullSecrets: [] + args: [] + env: [] + resources: {} + ccManager: enabled: true defaultMode: "on"