Skip to content

Commit 2d29a51

Browse files
committed
Add basic test for failing device iteration
Signed-off-by: Evan Lezar <elezar@nvidia.com>
1 parent 681ab4e commit 2d29a51

File tree

14 files changed

+31879
-0
lines changed

14 files changed

+31879
-0
lines changed

cmd/gpu-feature-discovery/main_test.go

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,10 @@ import (
1515
"testing"
1616
"time"
1717

18+
"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
19+
"github.com/NVIDIA/go-nvlib/pkg/nvlib/info"
20+
"github.com/NVIDIA/go-nvml/pkg/nvml"
21+
"github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100"
1822
"github.com/stretchr/testify/require"
1923

2024
spec "github.com/NVIDIA/k8s-device-plugin/api/config/v1"
@@ -420,6 +424,89 @@ func TestFailOnNVMLInitError(t *testing.T) {
420424
}
421425
}
422426

427+
// TODO: This should be extended to a more representative test.
428+
func TestGFDLabellers(t *testing.T) {
429+
nvmllib := dgxa100.New()
430+
431+
for _, d := range nvmllib.Devices {
432+
// TODO: This is not implemented in the mock.
433+
(d.(*dgxa100.Device)).GetGpuFabricInfoFunc = func() (nvml.GpuFabricInfo, nvml.Return) {
434+
return nvml.GpuFabricInfo{}, nvml.ERROR_NOT_SUPPORTED
435+
}
436+
}
437+
438+
// Force one of the devices to have errors when enumerating the device.
439+
workingDevices := nvmllib.DeviceGetHandleByIndexFunc
440+
nvmllib.DeviceGetHandleByIndexFunc = func(n int) (nvml.Device, nvml.Return) {
441+
if n == 0 {
442+
return nil, nvml.ERROR_INVALID_ARGUMENT
443+
}
444+
return workingDevices(n)
445+
}
446+
447+
devicelib := device.New(nvmllib,
448+
device.WithIgnoreVisitDevicesErrors(true),
449+
)
450+
451+
infolib := info.New(
452+
info.WithNvmlLib(nvmllib),
453+
info.WithDeviceLib(devicelib),
454+
)
455+
456+
cfg := &Config{}
457+
config := &spec.Config{
458+
Flags: spec.Flags{
459+
CommandLineFlags: spec.CommandLineFlags{
460+
DeviceDiscoveryStrategy: ptr("nvml"),
461+
FailOnInitError: ptr(true),
462+
MigStrategy: ptr("none"),
463+
GFD: &spec.GFDCommandLineFlags{
464+
MachineTypeFile: ptr(""),
465+
OutputFile: ptr(""),
466+
},
467+
},
468+
},
469+
}
470+
d, err := newGFDRunner(cfg, infolib, nvmllib, devicelib, config)
471+
require.NoError(t, err)
472+
473+
loopLabelers, err := lm.NewLabelers(d.manager, d.vgpu, d.config)
474+
require.NoError(t, err)
475+
476+
labels, err := loopLabelers.Labels()
477+
require.NoError(t, err)
478+
479+
expectedLabels := map[string]string{
480+
"nvidia.com/cuda.driver-version.full": "550.54.15",
481+
"nvidia.com/cuda.driver-version.major": "550",
482+
"nvidia.com/cuda.driver-version.minor": "54",
483+
"nvidia.com/cuda.driver-version.revision": "15",
484+
"nvidia.com/cuda.driver.major": "550",
485+
"nvidia.com/cuda.driver.minor": "54",
486+
"nvidia.com/cuda.driver.rev": "15",
487+
"nvidia.com/cuda.runtime-version.full": "12.4",
488+
"nvidia.com/cuda.runtime-version.major": "12",
489+
"nvidia.com/cuda.runtime-version.minor": "4",
490+
"nvidia.com/cuda.runtime.major": "12",
491+
"nvidia.com/cuda.runtime.minor": "4",
492+
"nvidia.com/gpu.compute.major": "8",
493+
"nvidia.com/gpu.compute.minor": "0",
494+
"nvidia.com/gpu.count": "7",
495+
"nvidia.com/gpu.family": "ampere",
496+
"nvidia.com/gpu.machine": "unknown",
497+
"nvidia.com/gpu.memory": "40960",
498+
"nvidia.com/gpu.mode": "unknown",
499+
"nvidia.com/gpu.product": "Mock-NVIDIA-A100-SXM4-40GB",
500+
"nvidia.com/gpu.replicas": "1",
501+
"nvidia.com/gpu.sharing-strategy": "none",
502+
"nvidia.com/mig.capable": "true",
503+
"nvidia.com/mps.capable": "false",
504+
}
505+
506+
require.EqualValues(t, expectedLabels, (map[string]string)(labels))
507+
508+
}
509+
423510
func buildLabelMapFromOutput(output []byte) (map[string]string, error) {
424511
labels := make(map[string]string)
425512

vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/computeinstance.go

Lines changed: 105 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)