From 44b770f634c4ee976e90d0ea7e9086f73e99ed45 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Wed, 10 Dec 2025 14:04:57 +0100 Subject: [PATCH] Fix healthchecking on old devices In cases where registering events for a device is not supported, we should not mark the device as unhealthy, but skip the device instead. Signed-off-by: Evan Lezar --- internal/rm/health.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/internal/rm/health.go b/internal/rm/health.go index 1f0fc5c41..46f036bc6 100644 --- a/internal/rm/health.go +++ b/internal/rm/health.go @@ -102,10 +102,10 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic } ret = gpu.RegisterEvents(eventMask&supportedEvents, eventSet) - if ret == nvml.ERROR_NOT_SUPPORTED { + switch { + case ret == nvml.ERROR_NOT_SUPPORTED: klog.Warningf("Device %v is too old to support healthchecking.", d.ID) - } - if ret != nvml.SUCCESS { + case ret != nvml.SUCCESS: klog.Infof("Marking device %v as unhealthy: %v", d.ID, ret) unhealthy <- d }