
Commit 5db4366

Merge pull request #34 from oracle-quickstart/rg_plugin_helm
Node problem detector integration into helm charts
2 parents 8c21166 + f7142f6 commit 5db4366

File tree

5 files changed: +145, -73 lines


GETTING_STARTED_HELM_DEPLOY.md

Lines changed: 1 addition & 1 deletion
@@ -232,7 +232,7 @@ helm install lens ./helm -n lens \
 
 ## Step 2: OCI GPU Data Plane Plugin installation on GPU Nodes
 
-**NOTE**: Instructions for running the data plane plugin as a Kubernetes-native daemon set on [AMD MI300X nodes can be found here](./oci-scanner-plugin-amd-helm/README.md). An Nvidia daemon-set offering is coming soon. Issue#22
+**NOTE**: Instructions for running the data plane plugin as a Kubernetes-native daemon set on [AMD and Nvidia nodes can be found here](./oci-scanner-plugin-helm/README.md). Supported GPUs: MI300X, MI355X, A10, H100, and B200.
 
 1. **Navigate to Dashboards**: Go to the dashboard section of the OCI GPU Scanner Portal
 2. **Go to Tab - OCI GPU Scanner Install Script**:

OKE_NPD_DEPLOY.md

Lines changed: 0 additions & 39 deletions
This file was deleted.

oci-scanner-plugin-helm/README.md

Lines changed: 48 additions & 1 deletion
@@ -12,6 +12,7 @@ Multi-vendor GPU monitoring and health check solution for OCI compute instances
 - **Pod Node Mapper**: Pod-to-node relationship tracking
 - **Health Check**: GPU performance testing (optional)
 - **DRHPC**: Distributed diagnostic monitoring for both AMD and NVIDIA
+- **Node Problem Detector**: GPU health monitoring via DRHPC integration (requires node labeling)
 
 ## Configuration
 
@@ -27,6 +28,11 @@ helm install oci-gpu-scanner-plugin . -f values.yaml -n oci-gpu-scanner-plugin \
 helm install oci-gpu-scanner-plugin ./oci-scanner-plugin-amd-helm \
   --set healthCheck.enabled=true
 
+# Enable Node Problem Detector (requires node labeling and drhpc enabled; see below)
+helm upgrade oci-gpu-scanner-plugin . \
+  --set nodeProblemDetector.enabled=true \
+  --set drhpc.enabled=true
+
 # Uninstall
 helm uninstall oci-gpu-scanner-plugin -n oci-gpu-scanner-plugin
 ```
@@ -36,4 +42,45 @@ helm uninstall oci-gpu-scanner-plugin -n oci-gpu-scanner-plugin
 - Kubernetes cluster with AMD / Nvidia GPU nodes
 - Prometheus Push Gateway accessible from cluster
 - AMD GPU drivers installed on nodes
-- Nvidia GPU Drivers installed on the nodes
+- Nvidia GPU drivers installed on the nodes
+
+# Installing and Using the OKE Node Problem Detector (NPD) DaemonSet with the OCI GPU Scanner Service
+
+OKE NPD is an extension of https://github.com/kubernetes/node-problem-detector that processes GPU health check failures reported by the GPU Scanner service and sets conditions on the affected nodes. This enables proactive monitoring of GPU node health and early detection of issues.
+
+**IMPORTANT**: The Node Problem Detector only works on GPU nodes labeled with `oci.oraclecloud.com/oke-node-problem-detector-enabled="true"`. NPD only starts processing GPU health check events once DRHPC is running on the nodes, so make sure DRHPC is enabled when you install the helm chart.
+
+Before enabling NPD, label your GPU nodes:
+
+```bash
+# Label individual nodes
+kubectl label nodes <node-name> oci.oraclecloud.com/oke-node-problem-detector-enabled=true
+
+# Label all AMD GPU nodes
+kubectl label nodes --selector=amd.com/gpu=true oci.oraclecloud.com/oke-node-problem-detector-enabled=true
+
+# Label all NVIDIA GPU nodes
+kubectl label nodes --selector=nvidia.com/gpu=true oci.oraclecloud.com/oke-node-problem-detector-enabled=true
+
+# Verify labels
+kubectl get nodes --show-labels | grep oke-node-problem-detector-enabled
+```
+
+Then enable NPD:
+
+```bash
+helm upgrade oci-gpu-scanner-plugin . \
+  --set nodeProblemDetector.enabled=true \
+  --set drhpc.enabled=true
+```
+
+**Note**: NPD requires DRHPC to be enabled and running to provide GPU health check data.
+
+Verify that the NPD DaemonSet has been installed and is running:
+
+```bash
+kubectl get pods -l app=oke-node-problem-detector -o wide -n kube-system
+```
+
+The output should show `oke-node-problem-detector` pods in the Running state on all targeted GPU nodes.
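
Once labeled nodes are reporting, the conditions NPD sets can be inspected on the node objects themselves. A minimal verification sketch: `<node-name>` is a placeholder, and the GPU-specific condition types come from the `dr-hpc.json` custom plugin monitor, which is not shown in this diff, so no particular condition name is assumed here.

```bash
# Sketch: show the Conditions block NPD maintains on a labeled GPU node.
# <node-name> is a placeholder; GPU-specific condition types are defined by
# the dr-hpc.json custom plugin monitor and may vary between deployments.
kubectl describe node <node-name> | grep -A 15 "Conditions:"

# Alternatively, list every condition type, status, and reason via jsonpath.
kubectl get node <node-name> \
  -o jsonpath='{range .status.conditions[*]}{.type}{"\t"}{.status}{"\t"}{.reason}{"\n"}{end}'
```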

existing_cluster_deploy/oke-node-problem-detector.yaml renamed to oci-scanner-plugin-helm/templates/node-problem-detector.yaml

Lines changed: 32 additions & 30 deletions
@@ -1,10 +1,16 @@
+{{- if .Values.nodeProblemDetector.enabled }}
+---
 apiVersion: apps/v1
 kind: DaemonSet
 metadata:
   name: oke-node-problem-detector
-  namespace: kube-system
+  namespace: {{ .Values.nodeProblemDetector.namespace | default "kube-system" }}
   labels:
     app: oke-node-problem-detector
+    component: gpu-monitoring
+    {{- with .Values.global.labels }}
+    {{- toYaml . | nindent 4 }}
+    {{- end }}
 spec:
   selector:
     matchLabels:
@@ -13,9 +19,14 @@ spec:
     metadata:
       labels:
         app: oke-node-problem-detector
+        component: gpu-monitoring
     spec:
       nodeSelector:
+        {{- if .Values.nodeProblemDetector.nodeSelector }}
+        {{- toYaml .Values.nodeProblemDetector.nodeSelector | nindent 8 }}
+        {{- else }}
         oci.oraclecloud.com/oke-node-problem-detector-enabled: "true"
+        {{- end }}
       affinity:
         nodeAffinity:
           requiredDuringSchedulingIgnoredDuringExecution:
@@ -30,7 +41,9 @@ spec:
         - /node-problem-detector --logtostderr --prometheus-port=${PROMETHEUS_PORT}
           --prometheus-address 0.0.0.0 --config.system-log-monitor=/config/kernel-monitor.json,/config/readonly-monitor.json
           --config.custom-plugin-monitor=/node-problem-detector-custom-check/imds_reachability.json
+          {{- if .Values.nodeProblemDetector.enableGpuChecks }}
           --config.custom-plugin-monitor=/node-problem-detector-gpu-check/dr-hpc.json
+          {{- end }}
           --enable-k8s-exporter=true
         command:
         - /bin/sh
@@ -42,21 +55,16 @@ spec:
                   apiVersion: v1
                   fieldPath: spec.nodeName
             - name: PROMETHEUS_PORT
-              value: "20257"
-          image: phx.ocir.io/idnlixcmffxd/oke-public-node-problem-detector:v0.8.20.7@sha256:399b506dbfa5c33e60a247d0d3199f025d242b7a7480c956446e70eaa090c599
-          imagePullPolicy: Always
+              value: {{ .Values.nodeProblemDetector.prometheusPort | default "20257" | quote }}
+          image: {{ .Values.nodeProblemDetector.image.repository }}:{{ .Values.nodeProblemDetector.image.tag }}@{{ .Values.nodeProblemDetector.image.sha256 }}
+          imagePullPolicy: {{ .Values.nodeProblemDetector.image.pullPolicy | default "Always" }}
           name: oke-node-problem-detector
           ports:
-          - containerPort: 20257
+          - containerPort: {{ .Values.nodeProblemDetector.prometheusPort | default 20257 }}
             name: metrics
            protocol: TCP
          resources:
-            limits:
-              cpu: 10m
-              memory: 80Mi
-            requests:
-              cpu: 10m
-              memory: 80Mi
+            {{- toYaml .Values.nodeProblemDetector.resources | nindent 12 }}
          securityContext:
            privileged: true
          volumeMounts:
@@ -74,27 +82,17 @@ spec:
          - mountPath: /node-problem-detector-custom-check
            name: node-problem-detector-custom-check
            readOnly: true
+          {{- if .Values.nodeProblemDetector.enableGpuChecks }}
          - mountPath: /node-problem-detector-gpu-check
            name: node-problem-detector-gpu-check
            readOnly: true
+          {{- end }}
       serviceAccountName: oke-node-problem-detector-sa
       tolerations:
-      - key: CriticalAddonsOnly
-        operator: Exists
-      - key: oci.oraclecloud.com/oke-is-preemptible
-        operator: Exists
-      - effect: NoSchedule
-        key: nvidia.com/gpu
-        operator: Exists
-      - effect: NoSchedule
-        key: amd.com/gpu
-        operator: Exists
-      - effect: NoSchedule
-        key: oci.oraclecloud.com/node-auto-repair-scheduled
-        operator: Exists
+        {{- toYaml .Values.nodeProblemDetector.tolerations | nindent 8 }}
       volumes:
       - hostPath:
-          path: /home/ubuntu/oci-dr-hpc-v2/
+          path: {{ .Values.nodeProblemDetector.drhpcResultsPath | default "/var/lib/oci-dr-hpc-v2" }}
         name: log
       - hostPath:
          path: /dev/kmsg
@@ -109,17 +107,19 @@ spec:
          defaultMode: 493
          name: node-problem-detector-custom-check
        name: node-problem-detector-custom-check
+      {{- if .Values.nodeProblemDetector.enableGpuChecks }}
      - configMap:
          defaultMode: 493
          name: node-problem-detector-gpu-check
        name: node-problem-detector-gpu-check
+      {{- end }}
 
 ---
 apiVersion: v1
 kind: ConfigMap
 metadata:
   name: node-problem-detector-custom-check
-  namespace: kube-system
+  namespace: {{ .Values.nodeProblemDetector.namespace | default "kube-system" }}
 data:
   imds_reachability.sh: |
     #!/bin/bash
@@ -147,7 +147,6 @@ data:
       exit 1
     fi
 
-
   imds_reachability.json: |
     {
       "plugin": "custom",
@@ -177,12 +176,13 @@ data:
      ]
    }
 
+{{- if .Values.nodeProblemDetector.enableGpuChecks }}
 ---
 apiVersion: v1
 kind: ConfigMap
 metadata:
   name: node-problem-detector-gpu-check
-  namespace: kube-system
+  namespace: {{ .Values.nodeProblemDetector.namespace | default "kube-system" }}
 data:
   dr_hpc_check.sh: |
     #!/bin/bash
@@ -579,13 +579,14 @@ data:
        }
      ]
    }
+{{- end }}
 
 ---
 apiVersion: v1
 kind: ServiceAccount
 metadata:
   name: oke-node-problem-detector-sa
-  namespace: kube-system
+  namespace: {{ .Values.nodeProblemDetector.namespace | default "kube-system" }}
 
 ---
 apiVersion: rbac.authorization.k8s.io/v1
@@ -599,4 +600,5 @@ roleRef:
 subjects:
 - kind: ServiceAccount
   name: oke-node-problem-detector-sa
-  namespace: kube-system
+  namespace: {{ .Values.nodeProblemDetector.namespace | default "kube-system" }}
+{{- end }}
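
To sanity-check the conditional blocks above without touching a cluster, the manifest can be rendered locally. A sketch, assuming it is run from the oci-scanner-plugin-helm chart directory; the release name is illustrative:

```bash
# Render only the NPD manifest with the feature flags enabled and inspect the output.
helm template oci-gpu-scanner-plugin . \
  --set nodeProblemDetector.enabled=true \
  --set nodeProblemDetector.enableGpuChecks=true \
  --show-only templates/node-problem-detector.yaml

# Basic chart validation
helm lint .
```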

oci-scanner-plugin-helm/values.yaml

Lines changed: 64 additions & 2 deletions
@@ -163,7 +163,7 @@ rbac:
 
 # Node Exporter
 nodeExporter:
-  enabled: true
+  enabled: false
 
 # Override default values for the official chart
 prometheus-node-exporter:
@@ -209,4 +209,66 @@ podNodeMapper:
       cpu: "50m"
     limits:
       memory: "256Mi"
-      cpu: "200m"
+      cpu: "200m"
+
+# Node Problem Detector
+nodeProblemDetector:
+  enabled: false
+  enableGpuChecks: true  # Enable GPU health checks via DRHPC
+  namespace: kube-system
+
+  # DRHPC results path - must match the hostPath where DRHPC writes results
+  # Default is /var/lib/oci-dr-hpc-v2 which matches drhpc.resultsHostPath
+  drhpcResultsPath: "/var/lib/oci-dr-hpc-v2"
+
+  image:
+    repository: phx.ocir.io/idnlixcmffxd/oke-public-node-problem-detector
+    tag: v0.8.20.7
+    sha256: sha256:399b506dbfa5c33e60a247d0d3199f025d242b7a7480c956446e70eaa090c599
+    pullPolicy: Always
+
+  prometheusPort: 20257
+
+  # Node selector - defaults to the OKE node-problem-detector label
+  nodeSelector:
+    oci.oraclecloud.com/oke-node-problem-detector-enabled: "true"
+
+  # Tolerations for GPU nodes
+  tolerations:
+    - key: CriticalAddonsOnly
+      operator: Exists
+    - key: oci.oraclecloud.com/oke-is-preemptible
+      operator: Exists
+    - effect: NoSchedule
+      key: nvidia.com/gpu
+      operator: Exists
+    - effect: NoSchedule
+      key: amd.com/gpu
+      operator: Exists
+    - effect: NoSchedule
+      key: oci.oraclecloud.com/node-auto-repair-scheduled
+      operator: Exists
+
+  resources:
+    requests:
+      cpu: 10m
+      memory: 80Mi
+    limits:
+      cpu: 10m
+      memory: 80Mi
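
The defaults above can also be overridden per environment with a values file instead of many --set flags. A sketch using a hypothetical override file name (my-npd-values.yaml); the keys mirror the block above, and the resource figures are illustrative, not recommendations:

```bash
# Hypothetical override file; only keys that differ from the chart defaults are set.
cat > my-npd-values.yaml <<'EOF'
nodeProblemDetector:
  enabled: true
  enableGpuChecks: true
  resources:
    requests:
      cpu: 20m        # illustrative value, not a sizing recommendation
      memory: 128Mi
    limits:
      cpu: 50m
      memory: 256Mi
drhpc:
  enabled: true
EOF

# Apply the overrides to an existing release in its namespace.
helm upgrade oci-gpu-scanner-plugin . -f my-npd-values.yaml -n oci-gpu-scanner-plugin
```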
