Skip to content

Commit 04b8a45

Browse files
committed
Add basic test for failing device iteration
Signed-off-by: Evan Lezar <elezar@nvidia.com>
1 parent 681ab4e commit 04b8a45

File tree

14 files changed

+31868
-0
lines changed

14 files changed

+31868
-0
lines changed

cmd/gpu-feature-discovery/main_test.go

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ import (
1515
"testing"
1616
"time"
1717

18+
"github.com/NVIDIA/go-nvml/pkg/nvml"
19+
"github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100"
1820
"github.com/stretchr/testify/require"
1921

2022
spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1"
@@ -420,6 +422,80 @@ func TestFailOnNVMLInitError(t *testing.T) {
420422
}
421423
}
422424

425+
// TODO: This should be extended to a more representative test.
426+
func TestGFDLabellers(t *testing.T) {
427+
nvmllib := dgxa100.New()
428+
429+
for _, d := range nvmllib.Devices {
430+
// TODO: This is not implemented in the mock.
431+
(d.(*dgxa100.Device)).GetGpuFabricInfoFunc = func() (nvml.GpuFabricInfo, nvml.Return) {
432+
return nvml.GpuFabricInfo{}, nvml.ERROR_NOT_SUPPORTED
433+
}
434+
}
435+
436+
// Force one of the devices to have errors when enumerating the device.
437+
workingDevices := nvmllib.DeviceGetHandleByIndexFunc
438+
nvmllib.DeviceGetHandleByIndexFunc = func(n int) (nvml.Device, nvml.Return) {
439+
if n == 0 {
440+
return nil, nvml.ERROR_INVALID_ARGUMENT
441+
}
442+
return workingDevices(n)
443+
}
444+
445+
cfg := &Config{}
446+
config := &spec.Config{
447+
Flags: spec.Flags{
448+
CommandLineFlags: spec.CommandLineFlags{
449+
DeviceDiscoveryStrategy: ptr("nvml"),
450+
FailOnInitError: ptr(true),
451+
MigStrategy: ptr("none"),
452+
GFD: &spec.GFDCommandLineFlags{
453+
MachineTypeFile: ptr(""),
454+
OutputFile: ptr(""),
455+
},
456+
},
457+
},
458+
}
459+
d, err := newGFDRunner(cfg, nvmllib, config)
460+
require.NoError(t, err)
461+
462+
loopLabelers, err := lm.NewLabelers(d.manager, d.vgpu, d.config)
463+
require.NoError(t, err)
464+
465+
labels, err := loopLabelers.Labels()
466+
require.NoError(t, err)
467+
468+
expectedLabels := map[string]string{
469+
"nvidia.com/cuda.driver-version.full": "550.54.15",
470+
"nvidia.com/cuda.driver-version.major": "550",
471+
"nvidia.com/cuda.driver-version.minor": "54",
472+
"nvidia.com/cuda.driver-version.revision": "15",
473+
"nvidia.com/cuda.driver.major": "550",
474+
"nvidia.com/cuda.driver.minor": "54",
475+
"nvidia.com/cuda.driver.rev": "15",
476+
"nvidia.com/cuda.runtime-version.full": "12.4",
477+
"nvidia.com/cuda.runtime-version.major": "12",
478+
"nvidia.com/cuda.runtime-version.minor": "4",
479+
"nvidia.com/cuda.runtime.major": "12",
480+
"nvidia.com/cuda.runtime.minor": "4",
481+
"nvidia.com/gpu.compute.major": "8",
482+
"nvidia.com/gpu.compute.minor": "0",
483+
"nvidia.com/gpu.count": "7",
484+
"nvidia.com/gpu.family": "ampere",
485+
"nvidia.com/gpu.machine": "unknown",
486+
"nvidia.com/gpu.memory": "40960",
487+
"nvidia.com/gpu.mode": "unknown",
488+
"nvidia.com/gpu.product": "Mock-NVIDIA-A100-SXM4-40GB",
489+
"nvidia.com/gpu.replicas": "1",
490+
"nvidia.com/gpu.sharing-strategy": "none",
491+
"nvidia.com/mig.capable": "true",
492+
"nvidia.com/mps.capable": "false",
493+
}
494+
495+
require.EqualValues(t, expectedLabels, (map[string]string)(labels))
496+
497+
}
498+
423499
func buildLabelMapFromOutput(output []byte) (map[string]string, error) {
424500
labels := make(map[string]string)
425501

vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/computeinstance.go

Lines changed: 105 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)