|
1 | 1 | 'use strict' |
2 | 2 |
|
3 | 3 | const k8s = require('@kubernetes/client-node') |
| 4 | +const k8srp = require('kubernetes-resource-parser') |
| 5 | + |
// Reference per-Node capacity used to sanity-check container resource
// requests (see checkContainerResources). Frozen so these reference values
// cannot be mutated at runtime ('use strict' makes accidental writes throw).
const nodeResources = Object.freeze({
  'nvidia.com/gpu': 8, // GPUs per node
  'nvidia.com/roce_gdr': 2, // RoCE GDR network interfaces per node
  'cpu': 80, // CPU cores per node
  'memory': '800G' // Kubernetes resource quantity string
})
4 | 12 |
|
5 | 13 | class Client { |
6 | 14 | constructor () { |
@@ -140,6 +148,50 @@ function reservation (pod) { |
140 | 148 | return gpus |
141 | 149 | } |
142 | 150 |
|
// Check one container's resource requests against the per-Node capacity in
// `nodeResources` and log a WARNING for requests that no single Node can
// satisfy, or that are out of proportion with the GPUs requested.
//
// `namespace` and `workload` are Kubernetes objects (only metadata.name is
// read, for the warning text); `container` is a pod-spec container whose
// resources.requests / resources.limits are quantity strings.
function checkContainerResources (namespace, workload, container) {
  // selectively merge limits into requests: an explicit request wins,
  // otherwise the limit acts as the effective request
  const resources = {}
  for (const k in container.resources?.requests ?? {}) {
    resources[k] = container.resources.requests[k]
  }
  for (const k in container.resources?.limits ?? {}) {
    if (!(k in resources)) {
      resources[k] = container.resources.limits[k]
    }
  }

  const gpus = parseInt(resources['nvidia.com/gpu'] ?? '0', 10)
  const gdr = parseInt(resources['nvidia.com/roce_gdr'] ?? '0', 10)
  const cpus = k8srp.cpuParser(resources['cpu'] ?? '0')
  const mem = k8srp.memoryParser(resources['memory'] ?? '0')
  const nodeMem = k8srp.memoryParser(nodeResources['memory']) // hoisted: used twice below

  // warn if the resource requests cannot be satisfied by any single Node
  if (gpus > nodeResources['nvidia.com/gpu']) {
    console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has a container requesting ${gpus} GPUs`)
  }
  // bug fix: this previously compared against nodeResources['gdrPerNode'],
  // a key that does not exist, so the warning could never fire
  if (gdr > nodeResources['nvidia.com/roce_gdr']) {
    console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has a container requesting ${gdr} roce_gdr interfaces`)
  }
  if (cpus > nodeResources['cpu']) {
    console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has a container requesting ${cpus} CPUs`)
  }
  if (mem > nodeMem) {
    console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has a container requesting ${resources['memory']} memory`)
  }

  // warn if the resource:GPU ratio is not proportional to Node resources
  if (gdr > 0 && ((gpus === 0) || (gpus / gdr < nodeResources['nvidia.com/gpu'] / nodeResources['nvidia.com/roce_gdr']))) {
    console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has a container requesting ${gdr} roce_gdr but only ${gpus} GPUs`)
  }
  if (gpus > 0 && cpus > 0 && cpus / gpus > nodeResources['cpu'] / nodeResources['nvidia.com/gpu']) {
    console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has a container requesting ${cpus} cpus but only ${gpus} GPUs`)
  }
  if (gpus > 0 && mem > 0 && mem / gpus > nodeMem / nodeResources['nvidia.com/gpu']) {
    console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has a container requesting ${resources['memory']} memory but only ${gpus} GPUs`)
  }
}
| 194 | + |
143 | 195 | // check user namespace |
144 | 196 | async function checkUserNamespace (client, namespace, queues) { |
145 | 197 | const workloads = await client.workloads(namespace.metadata.name) |
@@ -171,6 +223,16 @@ async function checkUserNamespace (client, namespace, queues) { |
171 | 223 | if (conditions['Evicted'] === 'True') { |
172 | 224 | console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has condition Evicted=True`) |
173 | 225 | } |
| 226 | + |
| 227 | + // report misconfigured resource requests |
| 228 | + for (const podSet of workload.spec?.podSets) { |
| 229 | + for (const ic of podSet.template?.spec?.initContainers ?? []) { |
| 230 | + checkContainerResources(namespace, workload, ic) |
| 231 | + } |
| 232 | + for (const c of podSet.template?.spec?.containers ?? []) { |
| 233 | + checkContainerResources(namespace, workload, c) |
| 234 | + } |
| 235 | + } |
174 | 236 | } |
175 | 237 | } |
176 | 238 |
|
|
0 commit comments