Skip to content

Commit 951c1a7

Browse files
authored
basic checking of container resources (#59)
1 parent acd30fe commit 951c1a7

File tree

3 files changed

+72
-2
lines changed

3 files changed

+72
-2
lines changed

tools/cluster-checker/checker.js

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,14 @@
11
'use strict'
22

33
const k8s = require('@kubernetes/client-node')
4+
const k8srp = require('kubernetes-resource-parser')
5+
6+
// Resources available on a single node, keyed by Kubernetes resource name.
// checkContainerResources compares container requests against these values;
// 'memory' is kept as a quantity string and parsed at the point of use.
const nodeResources = {
7+
'nvidia.com/gpu' : 8,
8+
'nvidia.com/roce_gdr' : 2,
9+
'cpu' : 80,
10+
'memory' : '800G'
11+
}
412

513
class Client {
614
constructor () {
@@ -140,6 +148,50 @@ function reservation (pod) {
140148
return gpus
141149
}
142150

151+
// check container resource requests against nodeResources
152+
/**
 * Check one container's resource requests against the per-node capacity in
 * nodeResources and log a WARNING line for every mismatch found.
 *
 * Two kinds of checks are made:
 *  - a request that exceeds what a single node offers;
 *  - a request whose ratio to the requested GPU count is out of proportion
 *    with the node's own resource-to-GPU ratio.
 *
 * @param {object} namespace - namespace object; only metadata.name is read
 * @param {object} workload - workload object; only metadata.name is read
 * @param {object} container - container spec; resources.requests and
 *   resources.limits are read and either may be absent
 * @returns {void} nothing; findings are reported with console.log
 */
function checkContainerResources(namespace, workload, container) {
  // selectively merge limits into requests: a limit is only used for a
  // resource that has no explicit request (later spread wins, so requests
  // override limits; spreading undefined contributes nothing)
  const resources = {
    ...container.resources?.limits,
    ...container.resources?.requests
  }

  const gpus = parseInt(resources['nvidia.com/gpu'] ?? '0', 10)
  const gdr = parseInt(resources['nvidia.com/roce_gdr'] ?? '0', 10)
  const cpus = k8srp.cpuParser(resources['cpu'] ?? '0')
  const mem = k8srp.memoryParser(resources['memory'] ?? '0')

  const nodeGpus = nodeResources['nvidia.com/gpu']
  const nodeGdr = nodeResources['nvidia.com/roce_gdr']
  const nodeCpus = nodeResources['cpu']
  // node memory is stored as a quantity string; parse it once so it is
  // comparable with the parsed container request
  const nodeMem = k8srp.memoryParser(nodeResources['memory'])

  // common prefix shared by every warning emitted below
  const prefix = `WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has a container requesting`

  // warn if the resource requests cannot be satisfied by a Node
  if (gpus > nodeGpus) {
    console.log(`${prefix} "${gpus} GPUs"`)
  }
  // compare against the real roce_gdr capacity; looking up a key that is not
  // in nodeResources yields undefined and the comparison would never be true
  if (gdr > nodeGdr) {
    console.log(`${prefix} "${gdr} roce_gdr interfaces"`)
  }
  if (cpus > nodeCpus) {
    console.log(`${prefix} "${cpus} CPUs"`)
  }
  if (mem > nodeMem) {
    console.log(`${prefix} ${resources['memory']} memory`)
  }

  // warn if the resource:GPU ratio is not proportional to Node resources
  if (gdr > 0 && (gpus === 0 || gpus / gdr < nodeGpus / nodeGdr)) {
    console.log(`${prefix} ${gdr} roce_gdr but only ${gpus} GPUs`)
  }
  if (gpus > 0 && cpus > 0 && cpus / gpus > nodeCpus / nodeGpus) {
    console.log(`${prefix} ${cpus} cpus but only ${gpus} GPUs`)
  }
  if (gpus > 0 && mem > 0 && mem / gpus > nodeMem / nodeGpus) {
    console.log(`${prefix} ${resources['memory']} memory but only ${gpus} GPUs`)
  }
}
194+
143195
// check user namespace
144196
async function checkUserNamespace (client, namespace, queues) {
145197
const workloads = await client.workloads(namespace.metadata.name)
@@ -171,6 +223,16 @@ async function checkUserNamespace (client, namespace, queues) {
171223
if (conditions['Evicted'] === 'True') {
172224
console.log(`WARNING: workload "${namespace.metadata.name}/${workload.metadata.name}" has condition Evicted=True`)
173225
}
226+
227+
// report misconfigured resource requests
228+
for (const podSet of workload.spec?.podSets) {
229+
for (const ic of podSet.template?.spec?.initContainers ?? []) {
230+
checkContainerResources(namespace, workload, ic)
231+
}
232+
for (const c of podSet.template?.spec?.containers ?? []) {
233+
checkContainerResources(namespace, workload, c)
234+
}
235+
}
174236
}
175237
}
176238

tools/cluster-checker/package-lock.json

Lines changed: 8 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tools/cluster-checker/package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
{
22
"dependencies": {
3-
"@kubernetes/client-node": "^0.21.0"
3+
"@kubernetes/client-node": "^0.21.0",
4+
"kubernetes-resource-parser": "0.1.0"
45
}
56
}

0 commit comments

Comments
 (0)